#! Multiply-accumulate on 32-bit limbs; invoked via `exec.mac` from u288_reduce.
#!
#! Given [b, c, a, carry] on stack top, following function computes
#!
#!  tmp = a + (b * c) + carry
#!  hi = tmp >> 32
#!  lo = tmp & 0xffff_ffff
#!  return (hi, lo)
#!
#! At end of execution of this function, stack top should look like [hi, lo]
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/scalar_field_utils.py#L41-L46
proc.mac
    # NOTE(review): the stack sketches below assume the u32overflowing_* ops leave
    # the high word / overflow flag on top of the low word - confirm for the VM
    # version this module is assembled with.

    # [b, c, a, carry] -> [hi', lo', carry]   with hi':lo' = b * c + a
    u32overflowing_madd

    # [lo', carry, hi'] -> [flag, lo, hi']    with lo = (lo' + carry) mod 2^32
    movdn.2
    u32overflowing_add

    # hi = hi' + flag; for u32 inputs a + b * c + carry <= 2^64 - 1, so hi stays
    # within 32 bits and a plain `add` is sufficient here
    movup.2
    add
end

#! Subtract-with-borrow on 32-bit limbs.
#!
#! Given [a, b, borrow] on stack top, following function computes
#!
#!  tmp = a - (b + borrow)
#!  hi = tmp >> 32
#!  lo = tmp & 0xffff_ffff
#!  return (hi, lo)
#!
#! At end of execution of this function, stack top should look like [hi, lo]
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/scalar_field_utils.py#L49-L55
proc.sbb
    # NOTE(review): no `exec.sbb` appears in the visible part of this file;
    # check whether this helper still has a caller.

    # [a, b, borrow] -> [b, borrow, a]
    movdn.2

    # b + borrow
    add

    # a - (b + borrow)
    u32overflowing_sub
end

#! Given a secp256k1 scalar field element in radix-2^32 representation ( Montgomery form )
#! and 32-bit unsigned integer, this routine computes a 288-bit number.
#!
#! Input via stack is expected in this form
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, b, ...] | a[0..8] -> 256-bit number, b = 32-bit number
#!
#! Computed output looks like below, on stack
#!
#! [carry, b7, b6, b5, b4, b3, b2, b1, b0, ...]
#!
#! Note that the output is most-significant-limb first ( carry on top ), i.e. in the
#! opposite limb order to the input operand.
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/scalar_field_utils.py#L65-L83
proc.u256xu32
    # lift the 32-bit multiplier above the eight limbs and seed the running carry with 0
    movup.8
    push.0

    # limb 0: copy the multiplier, pull up a0, multiply-add the ( zero ) carry
    dup.1
    movup.3
    u32overflowing_madd

    # limbs 1..6: same step each time; the dup / movup indices grow by one per
    # limb because one more finished low word is left sitting under the carry
    dup.2
    movup.4
    u32overflowing_madd

    dup.3
    movup.5
    u32overflowing_madd

    dup.4
    movup.6
    u32overflowing_madd

    dup.5
    movup.7
    u32overflowing_madd

    dup.6
    movup.8
    u32overflowing_madd

    dup.7
    movup.9
    u32overflowing_madd

    # limb 7: last use of the multiplier, so it is moved ( consumed ) instead of duplicated
    movup.8
    movup.9
    u32overflowing_madd
end

#! Given a 288-bit number and 256-bit number on stack ( in order ), this routine
#! computes a 288-bit number, by adding the 256-bit number to other operand
#!
#! Expected stack state during routine invocation
#!
#! [carry, b7, b6, b5, b4, b3, b2, b1, b0, c0, c1, c2, c3, c4, c5, c6, c7, ...]
#!
#! While after execution of this routine, stack should look like
#!
#! [d0, d1, d2, d3, d4, d5, d6, d7, carry, ...]
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/scalar_field_utils.py#L85-L98
proc.u288_add_u256
    # NOTE(review): the per-step annotations were obtained by tracing the documented
    # input layout ( 288-bit operand most-significant limb first, 256-bit operand
    # least-significant limb first ) through the instruction sequence, assuming
    # u32overflowing_add / _add3 leave the carry flag on top of the sum - confirm.

    # bring the word holding b0, c0, c1, c2 to the top
    swapw
    movupw.2

    # d0 = b0 + c0
    u32overflowing_add

    # d1 = b1 + c1 + carry
    movup.2
    movup.7
    u32overflowing_add3

    # d2 = b2 + c2 + carry
    movup.3
    movup.6
    u32overflowing_add3

    # pull b4, b3 above the partial result so that the word containing c3, c4
    # becomes reachable with a single movupw
    movup.4
    movup.5
    movupw.2

    # d3 = b3 + c3 + carry
    movup.2
    movup.4
    movup.6
    u32overflowing_add3

    # d4 = b4 + c4 + carry
    movup.5
    movup.5
    u32overflowing_add3

    # same regrouping trick, this time with b6, b5, to reach c5, c6
    movup.3
    movup.4
    movupw.2

    # d5 = b5 + c5 + carry
    movup.2
    movup.4
    movup.6
    u32overflowing_add3

    # d6 = b6 + c6 + carry
    movup.5
    movup.5
    u32overflowing_add3

    # d7 = b7 + c7 + carry
    movup.10
    movup.5
    u32overflowing_add3

    # fold the final carry-out into the 9th ( top ) limb of the 288-bit operand
    movup.4
    add

    # results were produced most-significant first; reverse the nine elements
    # so that d0 ends up on top and the carry limb at the bottom
    swap
    movup.2
    movup.3
    movup.4
    movup.5
    movup.6
    movup.7
    movup.8
end

#! Given [c0, c1, c2, c3, c4, c5, c6, c7, c8, pc, ...] on stack top,
#! this function attempts to reduce 288-bit number to a 256-bit number
#! along with carry, using montgomery reduction method. The modulo to which this
#! reduction is performed is secp256k1 scalar field prime.
#!
#! Find secp256k1 scalar field prime https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/scalar_field_consts.py#L19-L21
#!
#! In stack top content c[0..9] i.e. first 9 elements, holding 288-bit
#! number. Stack element `pc` ( at stack[9] ) is previous reduction round's
#! carry ( for first reduction round, it'll be set to 0 ).
#!
#! After finishing execution of this function, stack top should look like
#!
#! [c0, c1, c2, c3, c4, c5, c6, c7, pc, ...] | pc = next round's carry
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/scalar_field_utils.py#L118-L126
proc.u288_reduce
    # NOTE(review): 1435021631 ( 0x5588b13f ) is presumably the Montgomery constant
    # -p^-1 mod 2^32, and the eight constants fed to `exec.mac` below presumably are
    # the limbs of the modulus p, least-significant first - verify both against the
    # referenced scalar_field_consts.py.

    # q = ( c0 * 1435021631 ) mod 2^32
    dup
    push.1435021631
    u32wrapping_mul
    # q at stack top #

    # limb 0: c0 + q * p0, starting from a zero carry
    push.0
    movup.2
    push.3493216577
    dup.3
    exec.mac

    # only the high word ( carry ) of limb 0 is kept; the low word is discarded
    # ( Montgomery's choice of q is meant to make it zero )
    swap
    drop

    # limbs 1..5: c_i + q * p_i + carry; the movup / dup indices grow by one per
    # limb because each finished low word is left sitting below the carry
    movup.2
    push.3218235020
    dup.3
    exec.mac

    movup.3
    push.2940772411
    dup.4
    exec.mac

    movup.4
    push.3132021990
    dup.5
    exec.mac

    movup.5
    push.4294967294
    dup.6
    exec.mac

    movup.6
    push.4294967295
    dup.7
    exec.mac

    # limb 6: q is duplicated before the constant is pushed, so the two
    # multiplicands reach `mac` in swapped order ( the product is unaffected )
    movup.7
    dup.7
    push.4294967295
    exec.mac

    # limb 7: last use of q, so it is moved ( consumed ) instead of duplicated
    movup.7
    movup.8
    swap
    push.4294967295
    exec.mac

    # c8 + pc + carry: the low word becomes the new top limb, the flag the next `pc`
    movup.9
    movup.9
    u32overflowing_add3

    # limbs were produced most-significant first; reverse the nine elements so
    # that the least-significant limb is on top and the new `pc` at the bottom
    swap
    movup.2
    movup.3
    movup.4
    movup.5
    movup.6
    movup.7
    movup.8
end

#! Given two 256-bit numbers ( elements belonging to secp256k1 scalar field ) on stack,
#! where each number is represented in radix-2^32 form ( i.e. each number having eight
#! 32-bit limbs ), following function computes modular multiplication of those two
#! operands, computing 256-bit result, which belongs to secp256k1 scalar field.
#!
#! Stack expected as below, holding input
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7, ...] | a[0..8], b[0..8] are 256-bit numbers
#!
#! After finishing execution of this function, stack should look like
#!
#! [c0, c1, c2, c3, c4, c5, c6, c7, ...] | c[0..8] is a 256-bit number
#!
#! Note, for computing modular multiplication of a[0..8] & b[0..8],
#! school book multiplication equipped with Montgomery reduction technique
#! is used, which is why a[0..8], b[0..8] are expected to be in Montgomery form,
#! while computed c[0..8] will also be in Montgomery form.
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/scalar_field_utils.py#L101-L225
export.mul.2
    # keep operand a in the two locals ( local 0 = a0..a3, local 1 = a4..a7 ) so it
    # can be reloaded for every limb of b; the stack copy is consumed right below
    loc_storew.0
    swapw
    loc_storew.1
    swapw

    # round 0: a * b0
    exec.u256xu32

    # u256xu32 returns most-significant limb first; reverse the nine elements
    swap
    movup.2
    movup.3
    movup.4
    movup.5
    movup.6
    movup.7
    movup.8

    # first reduction round starts with previous carry `pc` = 0
    push.0
    movdn.9

    exec.u288_reduce

    # rounds 1..7 are identical: fetch the next limb of b from below the
    # accumulator, reload a from the locals, then accumulator += a * b_i and reduce

    # round 1
    movup.9
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.u256xu32
    exec.u288_add_u256
    exec.u288_reduce

    # round 2
    movup.9
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.u256xu32
    exec.u288_add_u256
    exec.u288_reduce

    # round 3
    movup.9
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.u256xu32
    exec.u288_add_u256
    exec.u288_reduce

    # round 4
    movup.9
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.u256xu32
    exec.u288_add_u256
    exec.u288_reduce

    # round 5
    movup.9
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.u256xu32
    exec.u288_add_u256
    exec.u288_reduce

    # round 6
    movup.9
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.u256xu32
    exec.u288_add_u256
    exec.u288_reduce

    # round 7
    movup.9
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.u256xu32
    exec.u288_add_u256
    exec.u288_reduce

    # all limbs of b are consumed; the last round's carry sits right below the
    # eight result limbs
    movup.8 # carry bit

    # When the carry is set, add the 256-bit constant
    # ( 801750719, 1076732275, 1354194884, 1162945305, 1, 0, 0, 0 ), least-significant
    # limb first. These are the same limbs `inv` stores as "1, in Montgomery form";
    # presumably 2^256 mod p, so the wrapped-around 2^256 is folded back in.
    #
    # NOTE(review): only the carry-out is tested here; whether a carry-free result
    # in [p, 2^256) can occur and would need a further subtraction is not evident
    # from this file - confirm against the referenced Python implementation.
    if.true
        push.801750719
        push.0
        u32overflowing_add3

        movup.2
        push.1076732275
        u32overflowing_add3

        movup.3
        push.1354194884
        u32overflowing_add3

        movup.4
        push.1162945305
        u32overflowing_add3

        movup.5
        push.1
        u32overflowing_add3

        movup.6
        push.0
        u32overflowing_add3

        movup.7
        push.0
        u32overflowing_add3

        # most-significant limb: the carry-out is intentionally not kept
        movup.8
        push.0
        u32wrapping_add3

        # limbs were produced most-significant first; restore least-significant-first order
        swap
        movup.2
        movup.3
        movup.4
        movup.5
        movup.6
        movup.7
    end
end

#! Just a wrapper function for ease of squaring an element of secp256k1 scalar field.
#!
#! Expected stack state
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, ...] a[0..8] is a 256-bit number
#!
#! Final stack state
#!
#! [b0, b1, b2, b3, b4, b5, b6, b7, ...] b[0..8] is a 256-bit number s.t. b = a * a
proc.sqr
    # duplicate the 8-limb operand: the first dupw.1 copies the high word on top,
    # after which index 1 refers to the low word, so the second dupw.1 copies that
    dupw.1
    dupw.1

    exec.mul
end

#! Given an element of secp256k1 scalar field, represented in Montgomery form i.e. eight 32-bit limbs,
#! this routine computes radix-2^32 representation of provided u256 number.
#!
#! Stack expected as
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, ...]
#!
#! Final stack should look like
#!
#! [a0`, a1`, a2`, a3`, a4`, a5`, a6`, a7`, ...]
#!
#! The conversion is carried out as a single Montgomery multiplication ( `mul` )
#! of the input by the plain ( non-Montgomery ) integer 1.
#!
#! See section 2.2 of https://eprint.iacr.org/2017/1057.pdf
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/scalar_field_utils.py#L238-L244
#! for implementation
export.from_mont
    # the last value pushed ends up on top, so the least-significant limb is 1
    push.0.0.0.0.0.0.0.1 # pushed 1's radix-2^32 form

    exec.mul
end

#! Given an element ( say a ) of secp256k1 scalar field, this routine computes multiplicative
#! inverse ( say a' ) of that element s.t. a * a' = 1 ( mod p ) | p = secp256k1 scalar field prime
#!
#! Expected stack state
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, ...] a[0..8] is a 256-bit number
#!
#! Final stack state
#!
#! [b0, b1, b2, b3, b4, b5, b6, b7, ...] b[0..8] is a 256-bit number s.t. b = a^-1 ( mod p )
#!
#! Note, both input and output stays in Montgomery form. If 0 is input operand, then multiplicative
#! inverse can't be computed, which is why output result is also 0.
#!
#! The inverse is obtained by exponentiation: the running result is squared once per
#! exponent bit and multiplied by the base whenever that bit is set, walking a fixed
#! 256-bit exponent from its most-significant bit downwards.
#!
#! Locals: 0, 1 hold the running result ( low word, high word ); 2, 3 hold the base.
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/37b339db3e03d24c2977399eb8896ef515ebb09b/field/scalar_field.py#L118-L136
export.inv.4
    # cache result initial value ( = 1, in Montgomery form )
    push.0.0.0.1.1162945305.1354194884.1076732275.801750719

    loc_storew.0
    dropw
    loc_storew.1
    dropw

    # cache base
    loc_storew.2
    dropw
    loc_storew.3
    dropw

    # Exponent limbs; the last value pushed ( most-significant limb ) ends up on top.
    # They match the modulus limbs used in u288_reduce except that the lowest limb
    # is smaller by 2, i.e. the exponent is p - 2.
    push.3493216575.3218235020.2940772411.3132021990.4294967294.4294967295.4294967295.4294967295

    # one outer iteration per 32-bit exponent limb, most-significant limb first
    repeat.8
        # one inner iteration per bit of the current limb, most-significant bit first
        repeat.32
            # result = result * result
            push.0.0.0.0.0.0.0.0
            loc_loadw.1
            swapw
            loc_loadw.0

            exec.sqr

            loc_storew.0
            dropw
            loc_storew.1
            dropw

            # test the top bit of the current exponent limb ( on a copy of the limb )
            dup
            u32shr.31

            if.true
                # result = result * base; the 16 zeros are placeholders that get
                # overwritten so the stack reads [ result ( 8 limbs ), base ( 8 limbs ), ... ]
                push.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0

                loc_loadw.3
                swapw
                loc_loadw.2
                swapdw
                loc_loadw.1
                swapw
                loc_loadw.0

                exec.mul

                loc_storew.0
                dropw
                loc_storew.1
                dropw
            end

            # shift the limb so that its next bit becomes the top bit
            u32shl.1
        end

        # all 32 bits of this limb are consumed; discard it to expose the next limb
        drop
    end

    # load the final result from the locals as the return value
    push.0.0.0.0.0.0.0.0

    loc_loadw.1
    swapw
    loc_loadw.0
end