# ===== HELPER FUNCTIONS ==========================================================================

#! Asserts that both values at the top of the stack are u64 values.
#! The input values are assumed to be represented using 32 bit limbs, fails if they are not.
#! This takes 6 cycles.
proc.u32assert4
    u32assert2
    movup.3
    movup.3
    u32assert2
    movup.3
    movup.3
end

# ===== ADDITION ==================================================================================

#! Performs addition of two unsigned 64 bit integers preserving the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [overflowing_flag, c_hi, c_lo, ...], where c = (a + b) % 2^64
#! This takes 6 cycles.
export.overflowing_add
    swap
    movup.3
    u32overflowing_add
    movup.3
    movup.3
    u32overflowing_add3
end

#! Performs addition of two unsigned 64 bit integers discarding the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = (a + b) % 2^64
#! This takes 7 cycles.
export.wrapping_add
    exec.overflowing_add
    drop
end

# ===== SUBTRACTION ===============================================================================

#! Performs subtraction of two unsigned 64 bit integers discarding the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = (a - b) % 2^64
#! This takes 10 cycles.
export.wrapping_sub
    movup.3
    movup.2
    u32overflowing_sub
    movup.3
    movup.3
    u32overflowing_sub
    drop
    swap
    u32overflowing_sub
    drop
end

#! Performs subtraction of two unsigned 64 bit integers preserving the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [underflowing_flag, c_hi, c_lo, ...], where c = (a - b) % 2^64
#! This takes 11 cycles.
export.overflowing_sub
    movup.3
    movup.2
    u32overflowing_sub
    movup.3
    movup.3
    u32overflowing_sub
    swap
    movup.2
    u32overflowing_sub
    movup.2
    or
end

# ===== MULTIPLICATION ============================================================================

#! Performs multiplication of two unsigned 64 bit integers discarding the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = (a * b) % 2^64
#! This takes 11 cycles.
export.wrapping_mul
    dup.3
    dup.2
    u32overflowing_mul
    movup.4
    movup.4
    u32overflowing_madd
    drop
    movup.3
    movup.3
    u32overflowing_madd
    drop
end

#! Performs multiplication of two unsigned 64 bit integers preserving the overflow.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_mid_hi, c_mid_lo, c_lo, ...], where c = (a * b) % 2^64
#! This takes 18 cycles.
export.overflowing_mul
    dup.3
    dup.2
    u32overflowing_mul
    dup.4
    movup.4
    u32overflowing_madd
    swap
    movup.5
    dup.4
    u32overflowing_madd
    movup.5
    movup.5
    u32overflowing_madd
    movup.3
    movup.2
    u32overflowing_add
    movup.2
    add
end

# ===== COMPARISONS ===============================================================================

#! Performs less-than comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a < b, and 0 otherwise.
#! This takes 11 cycles.
export.lt
    movup.3
    movup.2
    u32overflowing_sub
    movdn.3
    drop
    u32overflowing_sub
    swap
    eq.0
    movup.2
    and
    or
end

#! Performs greater-than comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a > b, and 0 otherwise.
#! This takes 11 cycles.
export.gt
    movup.2
    u32overflowing_sub
    movup.2
    movup.3
    u32overflowing_sub
    swap
    drop
    movup.2
    eq.0
    and
    or
end

#! Performs less-than-or-equal comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a <= b, and 0 otherwise.
#! This takes 12 cycles.
export.lte
    exec.gt
    not
end

#! Performs greater-than-or-equal comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a >= b, and 0 otherwise.
#! This takes 12 cycles.
export.gte
    exec.lt
    not
end

#! Performs equality comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a == b, and 0 otherwise.
#! This takes 6 cycles.
export.eq
    movup.2
    eq
    swap
    movup.2
    eq
    and
end

#! Performs inequality comparison of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c, ...], where c = 1 when a != b, and 0 otherwise.
#! This takes 6 cycles.
export.neq
    movup.2
    neq
    swap
    movup.2
    neq
    or
end

#! Performs comparison to zero of an unsigned 64 bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [a_hi, a_lo, ...] -> [c, ...], where c = 1 when a == 0, and 0 otherwise.
#! This takes 4 cycles.
export.eqz
    eq.0
    swap
    eq.0
    and
end

#! Compares two unsigned 64 bit integers and drop the larger one from the stack.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a when a < b, and b otherwise.
#! This takes 23 cycles.
export.min
    dupw
    exec.gt
    movup.4
    movup.3
    dup.2
    cdrop
    movdn.3
    cdrop
end

#! Compares two unsigned 64 bit integers and drop the smaller one from the stack.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a when a > b, and b otherwise.
#! This takes 23 cycles.
export.max
    dupw
    exec.lt
    movup.4
    movup.3
    dup.2
    cdrop
    movdn.3
    cdrop
end

# ===== DIVISION ==================================================================================

#! Performs division of two unsigned 64 bit integers discarding the remainder.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a // b
#! This takes 54 cycles.
export.div
    adv.push_u64div     # push the quotient and the remainder onto the advice stack

    adv_push.2          # pop the quotient from the advice stack and assert it consists of
    u32assert2         # 32-bit limbs

    dup.3               # multiply quotient by the divisor and make sure the resulting value
    dup.2               # fits into 2 32-bit limbs
    u32overflowing_mul
    dup.4
    dup.4
    u32overflowing_madd
    eq.0
    assert
    dup.5
    dup.3
    u32overflowing_madd
    eq.0
    assert
    dup.4
    dup.3
    mul
    eq.0
    assert

    adv_push.2          # pop the remainder from the advice stack and assert it consists of
    u32assert2         # 32-bit limbs

    movup.7             # make sure the divisor is greater than the remainder. this also consumes
    movup.7             # the divisor
    dup.3
    dup.3
    exec.gt
    assert

    swap                # add remainder to the previous result; this also consumes the remainder
    movup.3
    u32overflowing_add
    movup.3
    movup.3
    u32overflowing_add3
    eq.0
    assert

    movup.4             # make sure the result we got is equal to the dividend
    assert_eq
    movup.3
    assert_eq           # quotient remains on the stack
end

# ===== MODULO OPERATION ==========================================================================

#! Performs modulo operation of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a % b
#! This takes 54 cycles.
export.mod
    adv.push_u64div     # push the quotient and the remainder onto the advice stack

    adv_push.2          # pop the quotient from the advice stack and assert it consists of
    u32assert2         # 32-bit limbs

    dup.3               # multiply quotient by the divisor and make sure the resulting value
    dup.2               # fits into 2 32-bit limbs
    u32overflowing_mul
    dup.4
    movup.4
    u32overflowing_madd
    eq.0
    assert
    dup.4
    dup.3
    u32overflowing_madd
    eq.0
    assert
    dup.3
    movup.3
    mul
    eq.0
    assert

    adv_push.2          # pop the quotient from the advice stack and assert it consists of
    u32assert2         # 32-bit limbs

    movup.5             # make sure the divisor is greater than the remainder. this also consumes
    movup.5             # the divisor
    dup.3
    dup.3
    exec.gt
    assert

    dup.1               # add remainder to the previous result
    movup.4
    u32overflowing_add
    movup.4
    dup.3
    u32overflowing_add3
    eq.0
    assert

    movup.4             # make sure the result we got is equal to the dividend
    assert_eq
    movup.3
    assert_eq           # remainder remains on the stack
end

# ===== DIVMOD OPERATION ==========================================================================

#! Performs divmod operation of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [r_hi, r_lo, q_hi, q_lo ...], where r = a % b, q = a / b
#! This takes 54 cycles.
export.divmod
    adv.push_u64div     # push the quotient and the remainder onto the advice stack

    adv_push.2          # pop the quotient from the advice stack and assert it consists of
    u32assert2         # 32-bit limbs

    dup.3               # multiply quotient by the divisor and make sure the resulting value
    dup.2               # fits into 2 32-bit limbs
    u32overflowing_mul
    dup.4
    dup.4
    u32overflowing_madd
    eq.0
    assert
    dup.5
    dup.3
    u32overflowing_madd
    eq.0
    assert
    dup.4
    dup.3
    mul
    eq.0
    assert

    adv_push.2          # pop the quotient from the advice stack and assert it consists of
    u32assert2         # 32-bit limbs

    movup.7             # make sure the divisor is greater than the remainder. this also consumes
    movup.7             # the divisor
    dup.3
    dup.3
    exec.gt
    assert

    dup.1               # add remainder to the previous result
    movup.4
    u32overflowing_add
    movup.4
    dup.3
    u32overflowing_add3
    eq.0
    assert

    movup.6             # make sure the result we got is equal to the dividend
    assert_eq
    movup.5
    assert_eq           # remainder remains on the stack
end

# ===== BITWISE OPERATIONS ========================================================================

#! Performs bitwise AND of two unsigned 64-bit integers.
#! The input values are assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a AND b.
#! This takes 6 cycles.
export.and
    swap
    movup.3
    u32and
    swap
    movup.2
    u32and
end

#! Performs bitwise OR of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, fails if they are not.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a OR b.
#! This takes 16 cycles.
export.or
    swap
    movup.3
    u32or
    swap
    movup.2
    u32or
end

#! Performs bitwise XOR of two unsigned 64 bit integers.
#! The input values are assumed to be represented using 32 bit limbs, fails if they are not.
#! Stack transition looks as follows:
#! [b_hi, b_lo, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a XOR b.
#! This takes 6 cycles.
export.xor
    swap
    movup.3
    u32xor
    swap
    movup.2
    u32xor
end

#! Performs left shift of one unsigned 64-bit integer using the pow2 operation.
#! The input value to be shifted is assumed to be represented using 32 bit limbs.
#! The shift value should be in the range [0, 64), otherwise it will result in an
#! error.
#! Stack transition looks as follows:
#! [b, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a << b mod 2^64.
#! This takes 28 cycles.
export.shl
    pow2
    u32split
    exec.wrapping_mul
end


#! Performs right shift of one unsigned 64-bit integer using the pow2 operation.
#! The input value to be shifted is assumed to be represented using 32 bit limbs.
#! The shift value should be in the range [0, 64), otherwise it will result in an
#! error.
#! Stack transition looks as follows:
#! [b, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a >> b.
#! This takes 44 cycles.
export.shr
    pow2
    u32split

    dup.1
    add
    movup.2
    swap
    u32divmod
    movup.3
    movup.3
    dup
    eq.0
    u32overflowing_sub
    not
    movdn.4
    dup
    movdn.4
    u32divmod
    drop
    push.4294967296
    dup.5
    mul
    movup.4
    div
    movup.2
    mul
    add
    movup.2
    cswap
end

#! Performs left rotation of one unsigned 64-bit integer using the pow2 operation.
#! The input value to be shifted is assumed to be represented using 32 bit limbs.
#! The shift value should be in the range [0, 64), otherwise it will result in an
#! error.
#! Stack transition looks as follows:
#! [b, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a << b mod 2^64.
#! This takes 35 cycles.
export.rotl
    push.31
    dup.1
    u32overflowing_sub
    swap
    drop
    movdn.3

    # Shift the low limb.
    push.31
    u32and
    pow2
    dup
    movup.3
    u32overflowing_mul

    # Shift the high limb.
    movup.3
    movup.3
    u32overflowing_madd

    # Carry the overflow shift to the low bits.
    movup.2
    add
    swap

    # Conditionally select the limb order based on whether it's shifting by > 31 or not.
    movup.2
    cswap
end

#! Performs right rotation of one unsigned 64-bit integer using the pow2 operation.
#! The input value to be shifted is assumed to be represented using 32 bit limbs.
#! The shift value should be in the range [0, 64), otherwise it will result in an
#! error.
#! Stack transition looks as follows:
#! [b, a_hi, a_lo, ...] -> [c_hi, c_lo, ...], where c = a << b mod 2^64.
#! This takes 44 cycles.
export.rotr
    push.31
    dup.1
    u32lt
    movdn.3

    # Shift the low limb left by 32-b.
    push.31
    u32and
    push.32
    swap
    u32wrapping_sub
    pow2
    dup
    movup.3
    mul
    u32split

    # Shift the high limb left by 32-b.
    movup.3
    movup.3
    mul
    add
    u32split

    # Carry the overflow shift to the low bits.
    movup.2
    add
    swap

    # Conditionally select the limb order based on whether it's shifting by > 31 or not.
    movup.2
    not
    cswap
end

#! Counts the number of leading zeros of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_hi, n_lo, ...] -> [clz, ...], where clz is a number of leading zeros of value n.
#! This takes 48 cycles.
export.clz
    dup.0
    eq.0

    if.true    # if n_hi == 0
        drop
        u32clz
        add.32 # clz(n_lo) + 32
    else
        swap
        drop
        u32clz # clz(n_hi)
    end
end

#! Counts the number of trailing zeros of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_hi, n_lo, ...] -> [ctz, ...], where ctz is a number of trailing zeros of value n.
#! This takes 41 cycles.
export.ctz
    swap
    dup.0
    eq.0

    if.true    # if n_lo == 0
        drop
        u32ctz
        add.32 # ctz(n_hi) + 32
    else
        swap
        drop
        u32ctz # ctz(n_lo)
    end
end

#! Counts the number of leading ones of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_hi, n_lo, ...] -> [clo, ...], where clo is a number of leading ones of value n.
#! This takes 47 cycles.
export.clo
    dup.0
    eq.4294967295

    if.true    # if n_hi == 11111111111111111111111111111111
        drop
        u32clo
        add.32 # clo(n_lo) + 32
    else
        swap
        drop
        u32clo # clo(n_hi)
    end
end

#! Counts the number of trailing ones of one unsigned 64-bit integer.
#! The input value is assumed to be represented using 32 bit limbs, but this is not checked.
#! Stack transition looks as follows:
#! [n_hi, n_lo, ...] -> [cto, ...], where cto is a number of trailing ones of value n.
#! This takes 40 cycles.
export.cto
    swap
    dup.0
    eq.4294967295

    if.true    # if n_lo == 11111111111111111111111111111111
        drop
        u32cto
        add.32 # cto(n_hi) + 32
    else
        swap
        drop
        u32cto # ctz(n_lo)
    end
end