#! Given [b, c, a, carry] on stack top, following function computes
#!
#!  tmp = a + (b * c) + carry
#!  hi = tmp >> 32
#!  lo = tmp & 0xffff_ffff
#!  return (hi, lo)
#!
#! At end of execution of this function, stack top should look like [hi, lo]
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_utils.py#L41-L46
proc.mac
  # Multiply-accumulate with carry-in.
  # Stack on entry: [b, c, a, carry, ...]

  # b * c + a, split into 32 -bit halves -> [hi1, lo1, carry, ...]
  u32overflowing_madd

  # park hi1 below the carry-in, then lo1 + carry -> [flag, lo, hi1, ...]
  movdn.2
  u32overflowing_add

  # fold the overflow flag of that addition into the high half -> [hi, lo, ...]
  movup.2
  add
end

#! Given [a, b, borrow] on stack top, following function computes
#!
#!  tmp = a - (b + borrow)
#!  hi = tmp >> 32
#!  lo = tmp & 0xffff_ffff
#!  return (hi, lo)
#!
#! At end of execution of this function, stack top should look like [hi, lo]
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_utils.py#L49-L55
proc.sbb
  # Subtract with borrow.
  # Stack on entry: [a, b, borrow, ...] with a, b being u32 values and borrow
  # being 0 or 1. Stack on exit: [hi, lo, ...] where lo = (a - b - borrow) mod 2^32
  # and hi = 1 if the subtraction underflowed, 0 otherwise.
  #
  # The subtraction is carried out as two chained u32 subtractions instead of
  # first forming b + borrow: that sum is 2^32 when b = 0xffff_ffff and
  # borrow = 1, which is not a valid u32 operand for u32overflowing_sub.

  # t = (a - b) mod 2^32, f1 = 1 if a < b -> [f1, t, borrow, ...]
  swap
  u32overflowing_sub

  # lo = (t - borrow) mod 2^32, f2 = 1 if t < borrow -> [f2, lo, f1, ...]
  movdn.2
  swap
  u32overflowing_sub

  # f1 and f2 are never both set ( a < b implies t >= 1, so t - borrow cannot
  # underflow ), hence their sum is the single outgoing borrow bit -> [hi, lo, ...]
  movup.2
  add
end

#! Given a secp256k1 base field element in radix-2^32 representation ( Montgomery form )
#! and a 32 -bit unsigned integer, this routine computes their 288 -bit product.
#!
#! Input via stack is expected in this form
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, b, ...] | a[0..8] -> 256 -bit number, b = 32 -bit number
#!
#! Computed output looks like below, on stack
#!
#! [carry, b7, b6, b5, b4, b3, b2, b1, b0, ...]
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_utils.py#L65-L83
proc.u256xu32
  # Limb-by-limb multiply-accumulate chain: every step computes a_i * b + carry,
  # keeps the low half as result limb r_i and passes the high half on as carry.
  # Stack on entry: [a0, a1, a2, a3, a4, a5, a6, a7, b, ...]

  # bring the 32 -bit multiplier to the top -> [b, a0, ..., a7, ...]
  movup.8

  # a0 * b + 0 -> [carry, r0, b, a1, ..., a7, ...]
  push.0
  dup.1
  movup.3
  u32overflowing_madd

  # a1 * b + carry -> [carry, r1, r0, b, a2, ..., a7, ...]
  dup.2
  movup.4
  u32overflowing_madd

  # a2 * b + carry -> [carry, r2, r1, r0, b, a3, ..., a7, ...]
  dup.3
  movup.5
  u32overflowing_madd

  # a3 * b + carry -> [carry, r3, ..., r0, b, a4, ..., a7, ...]
  dup.4
  movup.6
  u32overflowing_madd

  # a4 * b + carry -> [carry, r4, ..., r0, b, a5, a6, a7, ...]
  dup.5
  movup.7
  u32overflowing_madd

  # a5 * b + carry -> [carry, r5, ..., r0, b, a6, a7, ...]
  dup.6
  movup.8
  u32overflowing_madd

  # a6 * b + carry -> [carry, r6, ..., r0, b, a7, ...]
  dup.7
  movup.9
  u32overflowing_madd

  # last limb consumes b itself ( movup instead of dup ):
  # a7 * b + carry -> [carry, r7, r6, r5, r4, r3, r2, r1, r0, ...]
  movup.8
  movup.9
  u32overflowing_madd
end

#! Given a 288 -bit number and 256 -bit number on stack ( in order ), this routine
#! computes a 288 -bit number, by adding the 256 -bit number to other operand
#!
#! Expected stack state during routine invocation
#!
#! [carry, b7, b6, b5, b4, b3, b2, b1, b0, c0, c1, c2, c3, c4, c5, c6, c7, ...]
#!
#! While after execution of this routine, stack should look like
#!
#! [d0, d1, d2, d3, d4, d5, d6, d7, carry, ...]
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_utils.py#L85-L98
proc.u288_add_u256
  # Ripple-carry addition, least significant limb first; k denotes the running
  # carry flag produced by each u32 addition.
  # Stack on entry:
  #   [carry, b7, b6, b5, b4, b3, b2, b1, b0, c0, c1, c2, c3, c4, c5, c6, c7, ...]

  # bring the word holding b0, c0, c1, c2 to the top:
  #   [b0, c0, c1, c2, b4, b3, b2, b1, carry, b7, b6, b5, c3, c4, c5, c6, c7, ...]
  swapw
  movupw.2

  # d0 = b0 + c0 -> [k, d0, c1, c2, b4, b3, b2, b1, carry, ...]
  u32overflowing_add

  # d1 = b1 + c1 + k -> [k, d1, d0, c2, b4, b3, b2, carry, ...]
  movup.2
  movup.7
  u32overflowing_add3

  # d2 = b2 + c2 + k -> [k, d2, d1, d0, b4, b3, carry, b7, b6, b5, c3, c4, ...]
  movup.3
  movup.6
  u32overflowing_add3

  # lift b4 and b3 above the partial result, then fetch the word b6, b5, c3, c4:
  #   [b6, b5, c3, c4, b3, b4, k, d2, d1, d0, carry, b7, c5, c6, c7, ...]
  movup.4
  movup.5
  movupw.2

  # d3 = b3 + c3 + k -> [k, d3, b6, b5, c4, b4, d2, d1, d0, carry, b7, ...]
  movup.2
  movup.4
  movup.6
  u32overflowing_add3

  # d4 = b4 + c4 + k -> [k, d4, d3, b6, b5, d2, d1, d0, carry, b7, c5, c6, c7, ...]
  movup.5
  movup.5
  u32overflowing_add3

  # lift b6 and b5, then fetch the word carry, b7, c5, c6:
  #   [carry, b7, c5, c6, b5, b6, k, d4, d3, d2, d1, d0, c7, ...]
  movup.3
  movup.4
  movupw.2

  # d5 = b5 + c5 + k -> [k, d5, carry, b7, c6, b6, d4, d3, d2, d1, d0, c7, ...]
  movup.2
  movup.4
  movup.6
  u32overflowing_add3

  # d6 = b6 + c6 + k -> [k, d6, d5, carry, b7, d4, d3, d2, d1, d0, c7, ...]
  movup.5
  movup.5
  u32overflowing_add3

  # d7 = b7 + c7 + k -> [k, d7, d6, d5, carry, d4, d3, d2, d1, d0, ...]
  movup.10
  movup.5
  u32overflowing_add3

  # ninth limb: incoming carry plus the last overflow flag
  #   -> [carry', d7, d6, d5, d4, d3, d2, d1, d0, ...]
  movup.4
  add

  # reverse the nine limbs -> [d0, d1, d2, d3, d4, d5, d6, d7, carry', ...]
  swap
  movup.2
  movup.3
  movup.4
  movup.5
  movup.6
  movup.7
  movup.8
end

#! Given [c0, c1, c2, c3, c4, c5, c6, c7, c8, pc, ...] on stack top,
#! this function attempts to reduce 288 -bit number to a 256 -bit number
#! along with carry, using montgomery reduction method. The modulo to which this
#! reduction is performed is secp256k1 base field prime.
#!
#! Find secp256k1 base field prime https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_consts.py#L19-L21
#!
#! The first 9 stack elements, c[0..9], hold the 288 -bit number. Stack element
#! `pc` ( at stack[9] ) is the previous reduction round's carry ( for the first
#! reduction round, it is set to 0 ).
#!
#! After finishing execution of this function, stack top should look like
#!
#! [c0, c1, c2, c3, c4, c5, c6, c7, pc, ...] | pc = next round's carry
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_utils.py#L118-L126
proc.u288_reduce
  # One reduction round: q is derived from the lowest limb, then q * modulus is
  # added limb by limb through `mac`. The low result of the first step is thrown
  # away, so the remaining limbs shift down by one position.
  # The pushed constants 4294966319, 4294967294, 4294967295 ( x6 ) are the modulus
  # limbs, least significant first ( the same limbs `neg` subtracts from ).
  # Stack on entry: [c0, c1, c2, c3, c4, c5, c6, c7, c8, pc, ...]

  # q = ( c0 * 3525653809 ) mod 2^32 -> [q, c0, c1, ..., c8, pc, ...]
  dup
  push.3525653809
  u32wrapping_mul
  # q at stack top #

  # c0 + q * 4294966319 + 0 -> [hi, lo, q, c1, ..., c8, pc, ...]
  push.0
  movup.2
  push.4294966319
  dup.3
  exec.mac

  # discard the low half of the first step -> [hi, q, c1, ..., c8, pc, ...]
  swap
  drop

  # c1 + q * 4294967294 + hi -> [hi, r0, q, c2, ..., c8, pc, ...]
  movup.2
  push.4294967294
  dup.3
  exec.mac

  # c2 + q * 4294967295 + hi -> [hi, r1, r0, q, c3, ..., c8, pc, ...]
  movup.3
  push.4294967295
  dup.4
  exec.mac

  # c3 step -> [hi, r2, r1, r0, q, c4, ..., c8, pc, ...]
  movup.4
  push.4294967295
  dup.5
  exec.mac

  # c4 step -> [hi, r3, r2, r1, r0, q, c5, ..., c8, pc, ...]
  movup.5
  push.4294967295
  dup.6
  exec.mac

  # c5 step -> [hi, r4, r3, r2, r1, r0, q, c6, c7, c8, pc, ...]
  movup.6
  push.4294967295
  dup.7
  exec.mac

  # c6 step; the two factors reach `mac` in swapped order here, which does not
  # change their product -> [hi, r5, r4, r3, r2, r1, r0, q, c7, c8, pc, ...]
  movup.7
  dup.7
  push.4294967295
  exec.mac

  # c7 step; q is consumed ( moved, not duplicated )
  #   -> [hi, r6, r5, r4, r3, r2, r1, r0, c8, pc, ...]
  movup.7
  movup.8
  swap
  push.4294967295
  exec.mac

  # r7 = c8 + pc + hi, its overflow flag becomes the next round's carry
  #   -> [pc', r7, r6, r5, r4, r3, r2, r1, r0, ...]
  movup.9
  movup.9
  u32overflowing_add3

  # reverse the nine elements -> [r0, r1, r2, r3, r4, r5, r6, r7, pc', ...]
  swap
  movup.2
  movup.3
  movup.4
  movup.5
  movup.6
  movup.7
  movup.8
end

#! Given two 256 -bit numbers ( elements belonging to secp256k1 base field ) on stack,
#! where each number is represented in radix-2^32 form ( i.e. each number having eight
#! 32 -bit limbs ), following function computes modular multiplication of those two
#! operands, computing 256 -bit result, which belongs to secp256k1 base field.
#!
#! Stack expected as below, holding input
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7, ...] | a[0..8], b[0..8] are 256 -bit numbers
#!
#! After finishing execution of this function, stack should look like
#!
#! [c0, c1, c2, c3, c4, c5, c6, c7, ...] | c[0..8] is a 256 -bit number
#!
#! Note, for computing modular multiplication of a[0..8] & b[0..8],
#! school book multiplication equipped with Montgomery reduction technique
#! is used, which is why a[0..8], b[0..8] are expected to be in Montgomery form,
#! while computed c[0..8] will also be in Montgomery form.
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_utils.py#L101-L222
export.mul.2
  # Keep operand a in procedure locals ( local 0 <- a0..a3, local 1 <- a4..a7 )
  # so that it can be reloaded for every limb of b; loc_storew leaves the stack
  # untouched, hence the stack still reads [a0, ..., a7, b0, ..., b7, ...].
  loc_storew.0
  swapw
  loc_storew.1
  swapw

  # --- round 0: a * b0 ---
  # -> [carry, t7, t6, t5, t4, t3, t2, t1, t0, b1, ..., b7, ...]
  exec.u256xu32

  # reverse the nine limbs -> [t0, ..., t7, carry, b1, ..., b7, ...]
  swap
  movup.2
  movup.3
  movup.4
  movup.5
  movup.6
  movup.7
  movup.8

  # the first reduction round starts with a previous carry of 0
  push.0
  movdn.9

  # -> [r0, ..., r7, pc, b1, ..., b7, ...]
  exec.u288_reduce

  # --- rounds 1..7: accumulate a * b_i onto the running result, then reduce ---
  repeat.7
    # fetch the next limb of b and reload a from the locals
    #   -> [a0, ..., a7, b_i, r0, ..., r7, pc, ...]
    movup.9
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.u256xu32
    exec.u288_add_u256
    exec.u288_reduce
  end

  # --- final fold of the outstanding carry pc into the two lowest limbs ---
  # Stack here: [r0, r1, r2, r3, r4, r5, r6, r7, pc, ...]
  # r1 + pc -> [r1 + pc, pc, r0, r2, ..., r7, ...]
  movup.8
  movup.2
  dup.1
  add

  # r0 + 977 * pc, low 32 bits kept -> [c0, c1, r2, ..., r7, ...]
  # NOTE(review): the high half of this multiply-add is dropped and the limb-1
  # sum above uses a plain `add`; no carry is propagated further up.
  movup.2
  movup.2
  push.977

  u32overflowing_madd
  drop
end

#! Just a wrapper function for ease of squaring an element of secp256k1 base field.
#!
#! Expected stack state
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, ...] | a[0..8] is a 256 -bit number
#!
#! Final stack state
#!
#! [b0, b1, b2, b3, b4, b5, b6, b7, ...] | b[0..8] is a 256 -bit number s.t. b = a * a
proc.sqr
  # Duplicate the 8 -limb operand. Each dupw.1 copies the second word from the top:
  #   [a0..a3, a4..a7, ...] -> [a4..a7, a0..a3, a4..a7, ...]
  #                         -> [a0..a3, a4..a7, a0..a3, a4..a7, ...]
  dupw.1
  dupw.1

  # multiply the element by its own copy
  exec.mul
end

#! Given two 256 -bit numbers ( elements belonging to secp256k1 base field ) on stack,
#! where each number is represented in radix-2^32 form ( i.e. each number having eight
#! 32 -bit limbs ), following function computes modular addition of those two operands,
#! in secp256k1 base field.
#!
#! Stack expected as below, holding input
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7, ...] | a[0..8], b[0..8] are 256 -bit numbers
#!
#! After finishing execution of this function, stack should look like
#!
#! [c0, c1, c2, c3, c4, c5, c6, c7, ...] | c[0..8] is a 256 -bit number
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field.py#L57-L76
export.add
  # Ripple-carry addition of the eight limb pairs, followed by folding the final
  # carry back into the two lowest limbs; k denotes the running carry flag.
  # Stack on entry: [a0, ..., a7, b0, ..., b7, ...]

  # bring b0..b3 to the top -> [b0, b1, b2, b3, a0, ..., a7, b4, ..., b7, ...]
  movupw.2

  # c0 = a0 + b0 + 0 -> [k, c0, b1, b2, b3, a1, ..., a7, b4, ..., b7, ...]
  push.0
  movup.5
  u32overflowing_add3

  # c1 = a1 + b1 + k -> [k, c1, c0, b2, b3, a2, ...]
  movup.2
  movup.5
  u32overflowing_add3

  # c2 = a2 + b2 + k -> [k, c2, c1, c0, b3, a3, ...]
  movup.3
  movup.5
  u32overflowing_add3

  # c3 = a3 + b3 + k -> [k, c3, c2, c1, c0, a4, a5, a6, a7, b4, b5, b6, b7, ...]
  movup.4
  movup.5
  u32overflowing_add3

  # c4 = a4 + b4 + k -> [k, c4, ..., c0, a5, a6, a7, b5, b6, b7, ...]
  movup.5
  movup.9
  u32overflowing_add3

  # c5 = a5 + b5 + k -> [k, c5, ..., c0, a6, a7, b6, b7, ...]
  movup.6
  movup.9
  u32overflowing_add3

  # c6 = a6 + b6 + k -> [k, c6, ..., c0, a7, b7, ...]
  movup.7
  movup.9
  u32overflowing_add3

  # c7 = a7 + b7 + k -> [k, c7, c6, c5, c4, c3, c2, c1, c0, ...]
  movup.8
  movup.9
  u32overflowing_add3

  # c0' = low 32 bits of ( c0 + 977 * k ) -> [c0', k, c7, c6, c5, c4, c3, c2, c1, ...]
  # NOTE(review): the high half of this multiply-add is dropped, so a carry out
  # of limb 0 is not propagated — confirm this is acceptable for all inputs.
  movup.8
  dup.1
  push.977
  u32overflowing_madd
  drop

  # c1' = c1 + k ( plain `add`, no u32 wrap-around handling )
  #   -> [c1', c0', c7, c6, c5, c4, c3, c2, ...]
  swap
  movup.8
  add

  # restore limb order: pull c7 .. c2 up one at a time, then c1' and c0'
  #   -> [c0', c1', c2, c3, c4, c5, c6, c7, ...]
  movup.2
  movup.3
  movup.4
  movup.5
  movup.6
  movup.7
  movup.6
  movup.7
end

#! Given a secp256k1 base field element ( say a ) on stack, represented in Montgomery form
#! ( i.e. number having eight 32 -bit limbs ), following function negates it to
#! field element a' | a' + a = 0
#!
#! Stack expected as below, holding input
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, ...] | a[0..8] is a secp256k1 base field element
#!
#! After finishing execution of this function, stack should look like
#!
#! [c0, c1, c2, c3, c4, c5, c6, c7, ...] | c[0..8] is a secp256k1 base field element
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field.py#L78-L96
export.neg
  # Limb-wise subtraction ( constant limb ) - a_i - borrow through `sbb`, least
  # significant limb first. The pushed constants are the same modulus limbs that
  # `u288_reduce` multiplies by.
  # Stack on entry: [a0, a1, a2, a3, a4, a5, a6, a7, ...]

  # limb 0 with an incoming borrow of 0:
  #   [4294966319, a0, 0, a1, ...] -> [borrow, c0, a1, ..., a7, ...]
  push.0
  swap
  push.4294966319
  exec.sbb

  # limb 1 -> [borrow, c1, c0, a2, ..., a7, ...]
  movup.2
  push.4294967294
  exec.sbb

  # limb 2 -> [borrow, c2, c1, c0, a3, ..., a7, ...]
  movup.3
  push.4294967295
  exec.sbb

  # limb 3 -> [borrow, c3, c2, c1, c0, a4, ..., a7, ...]
  movup.4
  push.4294967295
  exec.sbb

  # limb 4 -> [borrow, c4, ..., c0, a5, a6, a7, ...]
  movup.5
  push.4294967295
  exec.sbb

  # limb 5 -> [borrow, c5, ..., c0, a6, a7, ...]
  movup.6
  push.4294967295
  exec.sbb

  # limb 6 -> [borrow, c6, ..., c0, a7, ...]
  movup.7
  push.4294967295
  exec.sbb

  # limb 7 -> [borrow, c7, c6, c5, c4, c3, c2, c1, c0, ...]
  movup.8
  push.4294967295
  exec.sbb

  # the borrow out of the top limb is discarded
  drop

  # reverse the eight limbs -> [c0, c1, c2, c3, c4, c5, c6, c7, ...]
  swap
  movup.2
  movup.3
  movup.4
  movup.5
  movup.6
  movup.7
end

#! Given two secp256k1 base field elements, say a, b, ( represented in Montgomery form,
#! each number having eight 32 -bit limbs ) on stack, following function computes modular
#! subtraction of those two operands c = a + (-b) = a - b
#!
#! Stack expected as below, holding input
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7, ...] | a[0..8], b[0..8] are secp256k1 base field elements
#!
#! After finishing execution of this function, stack should look like
#!
#! [c0, c1, c2, c3, c4, c5, c6, c7, ...] | c[0..8] is a secp256k1 base field element
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field.py#L98-L102
export.sub
  # Stack on entry: [a0..a3, a4..a7, b0..b3, b4..b7, ...]
  # Moving the fourth word to the top twice rotates b above a:
  #   -> [b4..b7, a0..a3, a4..a7, b0..b3, ...] -> [b0..b3, b4..b7, a0..a3, a4..a7, ...]
  movupw.3
  movupw.3

  # negate b in place, then add it to a: c = (-b) + a
  exec.neg
  exec.add
end

#! Given a 256 -bit number on stack, represented in radix-2^32 form i.e. eight 32 -bit limbs,
#! this routine computes Montgomery representation of provided radix-2^32 number.
#!
#! Stack expected in form
#!
#!  [a0, a1, a2, a3, a4, a5, a6, a7, ...]
#!
#! Final stack should look like
#!
#! [a0', a1', a2', a3', a4', a5', a6', a7', ...]
#!
#! See section 2.2 of https://eprint.iacr.org/2017/1057.pdf
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_utils.py#L225-L232
#! for implementation
export.to_mont
  # Push R2's radix-2^32 form in a single instruction; the value pushed last
  # ends up on top, so the stack becomes
  #   [954529, 1954, 1, 0, 0, 0, 0, 0, a0, a1, a2, a3, a4, a5, a6, a7, ...]
  # see https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_consts.py#L31
  push.0.0.0.0.0.1.1954.954529

  # Montgomery-multiply the input by R2
  exec.mul
end

#! Given a 256 -bit number on stack, represented in Montgomery form i.e. eight 32 -bit limbs,
#! this routine computes radix-2^32 representation of provided u256 number.
#!
#! Stack expected as
#!
#!  [a0, a1, a2, a3, a4, a5, a6, a7, ...]
#!
#! Final stack should look like
#!
#! [a0', a1', a2', a3', a4', a5', a6', a7', ...]
#!
#! See section 2.2 of https://eprint.iacr.org/2017/1057.pdf
#! See https://github.com/itzmeanjan/secp256k1/blob/6e5e654823a073add7d62b21ed88e9de9bb06869/field/base_field_utils.py#L235-L241
#! for implementation
export.from_mont
  # Push 1's radix-2^32 form in a single instruction; the value pushed last
  # ends up on top, so the stack becomes
  #   [1, 0, 0, 0, 0, 0, 0, 0, a0, a1, a2, a3, a4, a5, a6, a7, ...]
  push.0.0.0.0.0.0.0.1

  # Montgomery-multiply the input by 1
  exec.mul
end

#! Given an element ( say a ) of secp256k1 base field, this routine computes multiplicative
#! inverse ( say a' ) of that element s.t. a * a' = 1 ( mod p ) | p = secp256k1 base field prime
#!
#! Expected stack state
#!
#! [a0, a1, a2, a3, a4, a5, a6, a7, ...] | a[0..8] is a 256 -bit number
#!
#! Final stack state
#!
#! [b0, b1, b2, b3, b4, b5, b6, b7, ...] | b[0..8] is a 256 -bit number s.t. b = a^-1 ( mod p )
#!
#! Note, both input and output stays in Montgomery form. If 0 is input operand, then multiplicative
#! inverse can't be computed, which is why output result is also 0.
#!
#! See https://github.com/itzmeanjan/secp256k1/blob/37b339db3e03d24c2977399eb8896ef515ebb09b/field/base_field.py#L114-L132
export.inv.4
  # Square-and-multiply exponentiation, scanning the exponent from its most
  # significant bit. Local words 0 and 1 hold the running result ( low / high
  # four limbs ), local words 2 and 3 hold the base ( low / high four limbs ).

  # cache result initial value ( = 1, in Montgomery form )
  # after the push the stack reads [977, 1, 0, 0, 0, 0, 0, 0, a0, ..., a7, ...]
  push.0.0.0.0.0.0.1.977
  loc_storew.0
  dropw
  loc_storew.1
  dropw

  # cache base
  # the input element is consumed from the stack here
  loc_storew.2
  dropw
  loc_storew.3
  dropw

  # exponent limbs; the value pushed first ends up deepest, so the top of the
  # stack is the most significant limb. These are the modulus limbs used by
  # `neg`, with the lowest limb ( 4294966319 ) reduced by 2.
  push.4294966317.4294967294.4294967295.4294967295.4294967295.4294967295.4294967295.4294967295

  # outer loop: one pass per exponent limb; inner loop: one pass per bit
  repeat.8
    repeat.32
      # load the running result -> [res0..res3, res4..res7, limb, ...]
      push.0.0.0.0.0.0.0.0
      loc_loadw.1
      swapw
      loc_loadw.0

      # result = result * result
      exec.sqr

      # write the squared result back and remove it from the stack
      loc_storew.0
      dropw
      loc_storew.1
      dropw

      # test the current top bit of the exponent limb ( the limb itself is kept )
      dup
      u32shr.31
      if.true
        # bit is set: result = result * base
        push.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0

        # load the base into the top two words
        loc_loadw.3
        swapw
        loc_loadw.2

        # move the base below the two remaining zero words
        swapdw

        # overwrite those zero words with the running result
        #   -> [res0..res7, base0..base7, limb, ...]
        loc_loadw.1
        swapw
        loc_loadw.0

        exec.mul

        loc_storew.0
        dropw
        loc_storew.1
        dropw
      end

      # expose the next lower bit as the new top bit
      u32shl.1
    end

    # this limb is exhausted; continue with the next lower one
    drop
  end

  # return the final result -> [b0, b1, b2, b3, b4, b5, b6, b7, ...]
  push.0.0.0.0.0.0.0.0
  loc_loadw.1
  swapw
  loc_loadw.0
end