#! Writes the initial 4x4 blake3 state matrix ( sixteen 32-bit words, one row of four
#! words per memory address ) used for computing the blake3 2-to-1 hash.
#!
#! Expected stack state:
#!
#! [state_0_3_addr, state_4_7_addr, state_8_11_addr, state_12_15_addr, ...]
#!
#! Note, state_`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Final stack state:
#!
#! [...]
#!
#! i.e. all four addresses are consumed and the initialized hash state lives at those addresses:
#!
#! - rows 0 and 2 both hold the constants 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
#! - row 1 holds 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
#! - row 3 holds 0, 0, 64, 11 ( 64 is BLOCK_LEN; 11 matches CHUNK_START | CHUNK_END | ROOT
#!   of the BLAKE3 specification )
#!
#! Functionally this routine is equivalent to https://github.com/itzmeanjan/blake3/blob/f07d32e/include/blake3.hpp#L1709-L1713
proc.initialize_2to1
    # one copy of the first four constants serves both row 2 and row 0
    push.0xA54FF53A.0x3C6EF372.0xBB67AE85.0x6A09E667
    # => [c0, c1, c2, c3, state_0_3_addr, state_4_7_addr, state_8_11_addr, state_12_15_addr]
    movup.6
    mem_storew
    # row 2 written => [c0, c1, c2, c3, state_0_3_addr, state_4_7_addr, state_12_15_addr]
    movup.4
    mem_storew
    dropw
    # row 0 written => [state_4_7_addr, state_12_15_addr]

    # stage row 3 underneath row 1, then store them one after the other
    push.11.64.0.0
    push.0x5BE0CD19.0x1F83D9AB.0x9B05688C.0x510E527F
    # => [c4, c5, c6, c7, 0, 0, 64, 11, state_4_7_addr, state_12_15_addr]
    movup.8
    mem_storew
    dropw
    # row 1 written => [0, 0, 64, 11, state_12_15_addr]
    movup.4
    mem_storew
    dropw
    # row 3 written => []
end

#! Writes the initial 4x4 blake3 state matrix ( sixteen 32-bit words, one row of four
#! words per memory address ) used for computing the blake3 1-to-1 hash.
#!
#! Expected stack state:
#!
#! [state_0_3_addr, state_4_7_addr, state_8_11_addr, state_12_15_addr, ...]
#!
#! Note, state_`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Final stack state:
#!
#! [...]
#!
#! i.e. all four addresses are consumed and the initialized hash state lives at those addresses:
#!
#! - rows 0 and 2 both hold the constants 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
#! - row 1 holds 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
#! - row 3 holds 0, 0, 32, 11 ( 32 is BLOCK_LEN; 11 matches CHUNK_START | CHUNK_END | ROOT
#!   of the BLAKE3 specification )
#!
#! Functionally this routine is equivalent to https://github.com/itzmeanjan/blake3/blob/f07d32e/include/blake3.hpp#L1709-L1713
#! with the only difference being the value of BLOCK_LEN = 32
proc.initialize_1to1
    # one copy of the first four constants serves both row 2 and row 0
    push.0xA54FF53A.0x3C6EF372.0xBB67AE85.0x6A09E667
    # => [c0, c1, c2, c3, state_0_3_addr, state_4_7_addr, state_8_11_addr, state_12_15_addr]
    movup.6
    mem_storew
    # row 2 written => [c0, c1, c2, c3, state_0_3_addr, state_4_7_addr, state_12_15_addr]
    movup.4
    mem_storew
    dropw
    # row 0 written => [state_4_7_addr, state_12_15_addr]

    # stage row 3 underneath row 1, then store them one after the other
    push.11.32.0.0
    push.0x5BE0CD19.0x1F83D9AB.0x9B05688C.0x510E527F
    # => [c4, c5, c6, c7, 0, 0, 32, 11, state_4_7_addr, state_12_15_addr]
    movup.8
    mem_storew
    dropw
    # row 1 written => [0, 0, 32, 11, state_12_15_addr]
    movup.4
    mem_storew
    dropw
    # row 3 written => []
end

#! Permutes ordered message words, kept on stack top ( = sixteen 32-bit BLAKE3 words )
#!
#! Expected stack top:
#!
#! [s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15]
#!
#! After permutation, stack top:
#!
#! [s2, s6, s3, s10, s7, s0, s4, s13, s1, s11, s12, s5, s9, s14, s15, s8]
#!
#! Only the top sixteen stack elements are rearranged; nothing is read from or written to memory.
#!
#! See https://github.com/itzmeanjan/blake3/blob/f07d32ec10cbc8a10663b7e6539e0b1dab3e453b/include/blake3.hpp#L1623-L1639
#! and https://github.com/0xPolygonMiden/miden-vm/pull/313#discussion_r922627984
proc.permute_msg_words
    # stack comments list the top of the stack first
    movdn.7
    movup.5
    movdn.2
    movup.4
    movdn.7
    # => [s1, s2, s6, s3, s5, s7, s0, s4, s8, s9, s10, s11, s12, s13, s14, s15]
    swapw.3
    swap
    movdn.7
    # => [s12, s14, s15, s5, s7, s0, s4, s13, s8, s9, s10, s11, s1, s2, s6, s3]
    # the second word ( s7, s0, s4, s13 ) is already final; park it at the bottom
    swapdw
    movup.2
    movdn.7
    # => [s8, s9, s11, s1, s2, s6, s3, s10, s12, s14, s15, s5, s7, s0, s4, s13]
    swapw
    swapw.2
    # => [s12, s14, s15, s5, s8, s9, s11, s1, s2, s6, s3, s10, s7, s0, s4, s13]
    movup.3
    movdn.6
    movdn.5
    movup.3
    # => [s9, s14, s15, s8, s11, s12, s5, s1, s2, s6, s3, s10, s7, s0, s4, s13]
    swapw
    movup.3
    # => [s1, s11, s12, s5, s9, s14, s15, s8, s2, s6, s3, s10, s7, s0, s4, s13]
    # exchange the two double-words to reach the documented order
    swapdw
end

#! Given blake3 state matrix on stack top ( in order ) as 16 elements ( each of 32-bit ),
#! this routine computes the output chaining value, i.e. the 8-word digest returned by
#! both the 2-to-1 and the 1-to-1 hash.
#!
#! Expected stack state:
#!
#! [state0, state1, state2, state3, state4, state5, state6, state7, state8, state9, state10, state11, state12, state13, state14, state15, ...]
#!
#! After finalizing, stack should look like
#!
#! [dig0, dig1, dig2, dig3, dig4, dig5, dig6, dig7, ...]
#!
#! where dig`i` = state`i` XOR state`i + 8` | i ∈ [0, 8)
#!
#! See https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L116-L119 ,
#! you'll notice the second statement in the loop body of the above hyperlinked implementation is skipped,
#! that's because it doesn't dictate what the output digest will be.
proc.finalize
    # Iteration i ( 0-based ) starts with
    #   [state`i`, ..., state7, dig0, ..., dig`i-1`, state`i+8`, ..., state15, ...]
    # so the partner word state`i+8` always sits at depth 8: pull it up, XOR it into
    # state`i`, and rotate the fresh digest word behind the not-yet-processed state words.
    repeat.8
        movup.8
        u32xor
        movdn.7
    end
end

#! Given blake3 state matrix ( total 16 elements, each of 32-bit ) and
#! 8 message words ( each of 32-bit ), this routine performs column-wise mixing
#! of message words into blake3 hash state.
#!
#! Functionality wise this routine is equivalent to https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L55-L59
#!
#! Expected stack state:
#!
#! [state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr, m0, m1, m2, m3, m4, m5, m6, m7]
#!
#! Note, state`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Meaning four consecutive blake3 state words can be read from memory easily.
#!
#! Final stack state:
#!
#! [state0, state1, state2, state3, state4, state5, state6, state7, state8, state9, state10, state11, state12, state13, state14, state15]
#!
#! i.e. whole blake3 state is placed on stack ( in order ). The mixed state is NOT written back
#! to memory by this routine. The single procedure local temporarily holds [m1, m3, m5, m7].
proc.columnar_mixing.1
    # Stack comments list the top of the stack first; a0..a3 abbreviate the four row
    # addresses and s`i` abbreviates state`i`. "row r" means state words 4r .. 4r+3.
    # Each step below acts on all four columns ( 0,4,8,12 ) ( 1,5,9,13 ) ( 2,6,10,14 ) ( 3,7,11,15 ).

    # bring the message words to the top, in order
    swapw.2
    swapw
    # => [m0, m1, m2, m3, m4, m5, m6, m7, a0, a1, a2, a3]

    # split them into odd-indexed and even-indexed words
    movup.7
    movup.6
    movup.5
    movup.4
    # => [m1, m3, m5, m7, m0, m2, m4, m6, a0, a1, a2, a3]

    # keep [m1, m3, m5, m7] in local 0; they are needed only in the second half
    loc_storew.0

    # overwrite the top word with row 1 ( read from a1 ), then load row 0 ( from a0 ) above it
    movup.9
    mem_loadw
    movup.8
    push.0.0.0.0
    movup.4
    mem_loadw
    # => [s0, s1, s2, s3, s4, s5, s6, s7, m0, m2, m4, m6, a2, a3]

    # row 0 <- row 0 + row 1 + [m0, m2, m4, m6] ( the carry of each 3-way add is dropped )
    movup.8
    dup.5
    u32overflowing_add3
    drop

    swap
    movup.8
    dup.6
    u32overflowing_add3
    drop
    swap

    movup.2
    dup.6
    movup.9
    u32overflowing_add3
    drop
    movdn.2

    movup.3
    dup.7
    movup.9
    u32overflowing_add3
    drop
    movdn.3
    # => [s0, s1, s2, s3, s4, s5, s6, s7, a2, a3]

    # load row 3 ( from a3 )
    movup.9
    push.0.0.0.0
    movup.4
    mem_loadw
    # => [s12, s13, s14, s15, s0, s1, s2, s3, s4, s5, s6, s7, a2]

    # row 3 <- ( row 3 XOR row 0 ) rotated right by 16
    dup.4
    u32xor
    u32rotr.16

    swap
    dup.5
    u32xor
    u32rotr.16
    swap

    movup.2
    dup.6
    u32xor
    u32rotr.16
    movdn.2

    movup.3
    dup.7
    u32xor
    u32rotr.16
    movdn.3

    # load row 2 ( from a2, the last remaining address )
    movup.12
    push.0.0.0.0
    movup.4
    mem_loadw
    # => [s8, s9, s10, s11, s12, s13, s14, s15, s0, s1, s2, s3, s4, s5, s6, s7]

    # row 2 <- row 2 + row 3
    dup.4
    u32wrapping_add

    swap
    dup.5
    u32wrapping_add
    swap

    movup.2
    dup.6
    u32wrapping_add
    movdn.2

    movup.3
    dup.7
    u32wrapping_add
    movdn.3

    # rotate the rows so that row 1 is on top, directly above row 2
    movupw.3
    # => [row 1, row 2, row 3, row 0]

    # row 1 <- ( row 1 XOR row 2 ) rotated right by 12
    dup.4
    u32xor
    u32rotr.12

    swap
    dup.5
    u32xor
    u32rotr.12
    swap

    movup.2
    dup.6
    u32xor
    u32rotr.12
    movdn.2

    movup.3
    dup.7
    u32xor
    u32rotr.12
    movdn.3

    # bring row 0 back on top and slot the saved odd-indexed message words beneath it
    movupw.3
    push.0.0.0.0
    loc_loadw.0
    swapw
    # => [s0, s1, s2, s3, m1, m3, m5, m7, s4, s5, s6, s7, row 2, row 3]

    # row 0 <- row 0 + row 1 + [m1, m3, m5, m7] ( the carry of each 3-way add is dropped )
    movup.4
    dup.8
    u32overflowing_add3
    drop

    swap
    movup.4
    dup.8
    u32overflowing_add3
    drop
    swap

    movup.2
    movup.4
    dup.8
    u32overflowing_add3
    drop
    movdn.2

    movup.3
    movup.4
    dup.8
    u32overflowing_add3
    drop
    movdn.3
    # => [row 0, row 1, row 2, row 3]

    movupw.3
    # => [row 3, row 0, row 1, row 2]

    # row 3 <- ( row 3 XOR row 0 ) rotated right by 8
    dup.4
    u32xor
    u32rotr.8

    swap
    dup.5
    u32xor
    u32rotr.8
    swap

    movup.2
    dup.6
    u32xor
    u32rotr.8
    movdn.2

    movup.3
    dup.7
    u32xor
    u32rotr.8
    movdn.3

    movupw.3
    # => [row 2, row 3, row 0, row 1]

    # row 2 <- row 2 + row 3
    dup.4
    u32wrapping_add

    swap
    dup.5
    u32wrapping_add
    swap

    movup.2
    dup.6
    u32wrapping_add
    movdn.2

    movup.3
    dup.7
    u32wrapping_add
    movdn.3

    movupw.3
    # => [row 1, row 2, row 3, row 0]

    # row 1 <- ( row 1 XOR row 2 ) rotated right by 7
    dup.4
    u32xor
    u32rotr.7

    swap
    dup.5
    u32xor
    u32rotr.7
    swap

    movup.2
    dup.6
    u32xor
    u32rotr.7
    movdn.2

    movup.3
    dup.7
    u32xor
    u32rotr.7
    movdn.3

    # final rotation restores natural order
    movupw.3
    # => [row 0, row 1, row 2, row 3]
end

#! Given blake3 state matrix ( total 16 elements, each of 32-bit ) and
#! 8 message words ( each of 32-bit ), this routine performs diagonal-wise mixing
#! of message words into blake3 hash state.
#!
#! Functionality wise this routine is equivalent to https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L61-L64
#!
#! Expected stack state:
#!
#! [state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr, m0, m1, m2, m3, m4, m5, m6, m7]
#!
#! Note, state`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Meaning four consecutive blake3 state words can be read from memory easily.
#!
#! Final stack state:
#!
#! [state0, state1, state2, state3, state4, state5, state6, state7, state8, state9, state10, state11, state12, state13, state14, state15]
#!
#! i.e. whole blake3 state is placed on stack ( in order ). The mixed state is NOT written back
#! to memory by this routine. The single procedure local temporarily holds [m1, m3, m5, m7].
proc.diagonal_mixing.1
    # Stack comments list the top of the stack first; a0..a3 abbreviate the four row
    # addresses and s`i` abbreviates state`i`. "row r" means state words 4r .. 4r+3.
    # The rows are never physically rotated: each step instead picks its partner word with a
    # shifted index, which yields the diagonals ( 0,5,10,15 ) ( 1,6,11,12 ) ( 2,7,8,13 ) ( 3,4,9,14 ).

    # bring the message words to the top, in order
    swapw.2
    swapw
    # => [m0, m1, m2, m3, m4, m5, m6, m7, a0, a1, a2, a3]

    # split them into odd-indexed and even-indexed words
    movup.7
    movup.6
    movup.5
    movup.4
    # => [m1, m3, m5, m7, m0, m2, m4, m6, a0, a1, a2, a3]

    # keep [m1, m3, m5, m7] in local 0; they are needed only in the second half
    loc_storew.0

    # overwrite the top word with row 1 ( read from a1 ), then load row 0 ( from a0 ) above it
    movup.9
    mem_loadw
    movup.8
    push.0.0.0.0
    movup.4
    mem_loadw
    # => [s0, s1, s2, s3, s4, s5, s6, s7, m0, m2, m4, m6, a2, a3]

    # s0 <- s0 + s5 + m0 ( the carry of each 3-way add is dropped )
    movup.8
    dup.6
    u32overflowing_add3
    drop

    # s1 <- s1 + s6 + m2
    swap
    movup.8
    dup.7
    u32overflowing_add3
    drop
    swap

    # s2 <- s2 + s7 + m4
    movup.2
    movup.8
    dup.8
    u32overflowing_add3
    drop
    movdn.2

    # s3 <- s3 + s4 + m6
    movup.3
    movup.8
    dup.5
    u32overflowing_add3
    drop
    movdn.3
    # => [s0, s1, s2, s3, s4, s5, s6, s7, a2, a3]

    # load row 3 ( from a3 )
    movup.9
    push.0.0.0.0
    movup.4
    mem_loadw
    # => [s12, s13, s14, s15, s0, s1, s2, s3, s4, s5, s6, s7, a2]

    # s15 <- ( s15 XOR s0 ) rotated right by 16
    movup.3
    dup.4
    u32xor
    u32rotr.16
    movdn.3

    # s12 <- ( s12 XOR s1 ) rotated right by 16
    dup.5
    u32xor
    u32rotr.16

    # s13 <- ( s13 XOR s2 ) rotated right by 16
    swap
    dup.6
    u32xor
    u32rotr.16
    swap

    # s14 <- ( s14 XOR s3 ) rotated right by 16
    movup.2
    dup.7
    u32xor
    u32rotr.16
    movdn.2

    # load row 2 ( from a2, the last remaining address )
    movup.12
    push.0.0.0.0
    movup.4
    mem_loadw
    # => [s8, s9, s10, s11, s12, s13, s14, s15, s0, s1, s2, s3, s4, s5, s6, s7]

    # s10 <- s10 + s15
    movup.2
    dup.7
    u32wrapping_add
    movdn.2

    # s11 <- s11 + s12
    movup.3
    dup.4
    u32wrapping_add
    movdn.3

    # s8 <- s8 + s13
    dup.5
    u32wrapping_add

    # s9 <- s9 + s14
    swap
    dup.6
    u32wrapping_add
    swap

    movupw.3
    # => [row 1, row 2, row 3, row 0]

    # s5 <- ( s5 XOR s10 ) rotated right by 12
    swap
    dup.6
    u32xor
    u32rotr.12
    swap

    # s6 <- ( s6 XOR s11 ) rotated right by 12
    movup.2
    dup.7
    u32xor
    u32rotr.12
    movdn.2

    # s7 <- ( s7 XOR s8 ) rotated right by 12
    movup.3
    dup.4
    u32xor
    u32rotr.12
    movdn.3

    # s4 <- ( s4 XOR s9 ) rotated right by 12
    dup.5
    u32xor
    u32rotr.12

    # bring row 0 back on top and slot the saved odd-indexed message words beneath it
    movupw.3
    push.0.0.0.0
    loc_loadw.0
    swapw
    # => [s0, s1, s2, s3, m1, m3, m5, m7, s4, s5, s6, s7, row 2, row 3]

    # s0 <- s0 + s5 + m1 ( the carry of each 3-way add is dropped )
    movup.4
    dup.9
    u32overflowing_add3
    drop

    # s1 <- s1 + s6 + m3
    swap
    movup.4
    dup.9
    u32overflowing_add3
    drop
    swap

    # s2 <- s2 + s7 + m5
    movup.2
    movup.4
    dup.9
    u32overflowing_add3
    drop
    movdn.2

    # s3 <- s3 + s4 + m7
    movup.3
    movup.4
    dup.5
    u32overflowing_add3
    drop
    movdn.3
    # => [row 0, row 1, row 2, row 3]

    movupw.3
    # => [row 3, row 0, row 1, row 2]

    # s15 <- ( s15 XOR s0 ) rotated right by 8
    movup.3
    dup.4
    u32xor
    u32rotr.8
    movdn.3

    # s12 <- ( s12 XOR s1 ) rotated right by 8
    dup.5
    u32xor
    u32rotr.8

    # s13 <- ( s13 XOR s2 ) rotated right by 8
    swap
    dup.6
    u32xor
    u32rotr.8
    swap

    # s14 <- ( s14 XOR s3 ) rotated right by 8
    movup.2
    dup.7
    u32xor
    u32rotr.8
    movdn.2

    movupw.3
    # => [row 2, row 3, row 0, row 1]

    # s10 <- s10 + s15
    movup.2
    dup.7
    u32wrapping_add
    movdn.2

    # s11 <- s11 + s12
    movup.3
    dup.4
    u32wrapping_add
    movdn.3

    # s8 <- s8 + s13
    dup.5
    u32wrapping_add

    # s9 <- s9 + s14
    swap
    dup.6
    u32wrapping_add
    swap

    movupw.3
    # => [row 1, row 2, row 3, row 0]

    # s5 <- ( s5 XOR s10 ) rotated right by 7
    swap
    dup.6
    u32xor
    u32rotr.7
    swap

    # s6 <- ( s6 XOR s11 ) rotated right by 7
    movup.2
    dup.7
    u32xor
    u32rotr.7
    movdn.2

    # s7 <- ( s7 XOR s8 ) rotated right by 7
    movup.3
    dup.4
    u32xor
    u32rotr.7
    movdn.3

    # s4 <- ( s4 XOR s9 ) rotated right by 7
    dup.5
    u32xor
    u32rotr.7

    # final rotation restores natural order
    movupw.3
    # => [row 0, row 1, row 2, row 3]
end

#! Applies a single round of blake3 mixing: message words m0..m7 are mixed into the hash state
#! with columnar mixing, then m8..m15 are mixed in with diagonal mixing, and the resulting
#! state matrix ( 16 elements, each of 32-bit ) is written back to memory.
#!
#! Functionality wise this routine is equivalent to https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L54-L65
#!
#! Expected stack state:
#!
#! [state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15]
#!
#! Note, state`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Final stack state:
#!
#! [...]
#!
#! i.e. the four addresses and all sixteen message words are consumed; the mixed state matrix lives in
#! memory addresses {state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr}, which were provided,
#! on stack top, while invoking this routine.
#!
#! Local 0 keeps the caller's four addresses; locals 1..4 hold the intermediate state rows
#! between the columnar and the diagonal step.
proc.round.5
    # remember where the caller keeps the state; the addresses stay on the stack as well
    loc_storew.0

    exec.columnar_mixing
    # => [row 0, row 1, row 2, row 3, m8, ..., m15]

    # spill the half-mixed state into locals 1..4, one row per local
    loc_storew.1
    dropw
    loc_storew.2
    dropw
    loc_storew.3
    dropw
    loc_storew.4
    dropw

    # diagonal mixing reads its state from memory, so point it at those locals
    locaddr.4
    locaddr.3
    locaddr.2
    locaddr.1

    exec.diagonal_mixing
    # => [row 0, row 1, row 2, row 3]

    padw
    loc_loadw.0
    # => [a0, a1, a2, a3, row 0, row 1, row 2, row 3] | a`r` = caller's address for row r

    # Each iteration stores the topmost row at the leading address, then pads the address
    # word back to four elements with a trailing 0 so every iteration sees the same layout.
    repeat.4
        swapw
        movup.4
        mem_storew
        dropw
        push.0
        movdn.3
    end

    # all addresses are used up; only the four padding zeros are left
    dropw
end

#! Given blake3 state matrix ( total 16 elements, each of 32-bit, kept in memory ) and a message block
#! i.e. 16 message words ( each of 32-bit ), this routine applies 7 rounds of mixing
#! of (permuted) message words into hash state.
#!
#! Functionality wise this routine is equivalent to https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L75-L114
#!
#! Expected stack state:
#!
#! [state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15]
#!
#! Note, state`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Final stack state:
#!
#! [...]
#!
#! i.e. the addresses and the message words are consumed, and the 7-round mixed state matrix lives in
#! memory addresses {state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr}. The caller reads
#! the updated state matrix back from the same addresses it passed in.
proc.compress.1
    # park the four state addresses in local 0, leaving only the message words on the stack
    loc_storew.0
    dropw

    # rounds 1 .. 6, each followed by a permutation of the message words
    repeat.6
        # `round` consumes its sixteen message words, so give it a copy ( word by word )
        dupw.3
        dupw.3
        dupw.3
        dupw.3

        # put the state addresses back on top of the copied message words
        padw
        loc_loadw.0
        exec.round
        exec.permute_msg_words
    end

    # round 7 is the last one: no copy and no permutation needed, the words are simply consumed
    padw
    loc_loadw.0
    exec.round
end

#! Blake3 2-to-1 hash function, which takes 64-bytes input and produces 32-bytes output digest
#!
#! Expected stack state:
#!
#! [msg0, msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8, msg9, msg10, msg11, msg12, msg13, msg14, msg15, ...]
#!
#! msg`i` -> 32-bit message word | i ∈ [0, 16)
#!
#! Final stack state:
#!
#! [dig0, dig1, dig2, dig3, dig4, dig5, dig6, dig7, ...]
#!
#! dig`i` -> 32-bit digest word | i ∈ [0, 8)
export.hash_2to1.4
    # the four procedure locals hold the 4x4 state matrix, one row per local
    locaddr.3
    locaddr.2
    locaddr.1
    locaddr.0
    # => [row0_addr, row1_addr, row2_addr, row3_addr, msg0, ..., msg15, ...]

    # the initializer and the compressor each consume one copy of the address word
    dupw
    exec.initialize_2to1

    # A single compression suffices: the 64-bytes input is exactly one message block
    # of one chunk, which gets turned into the 32-bytes output.
    exec.compress
    # => [...]

    # read the mixed state back, last row first, so that state0 ends up on top
    padw
    loc_loadw.3
    padw
    loc_loadw.2
    padw
    loc_loadw.1
    padw
    loc_loadw.0

    exec.finalize
end

#! Blake3 1-to-1 hash function, which takes 32-bytes input and produces 32-bytes output digest
#!
#! Expected stack state:
#!
#! [msg0, msg1, msg2, msg3, msg4, msg5, msg6, msg7, ...]
#!
#! msg`i` -> 32-bit message word | i ∈ [0, 8)
#!
#! Final stack state:
#!
#! [dig0, dig1, dig2, dig3, dig4, dig5, dig6, dig7, ...]
#!
#! dig`i` -> 32-bit digest word | i ∈ [0, 8)
export.hash_1to1.4
    # Zero-pad the 32-bytes input up to a full 64-bytes message block: push eight zero
    # words, then move them beneath the eight message words.
    padw
    padw
    swapdw
    # => [msg0, ..., msg7, 0, 0, 0, 0, 0, 0, 0, 0, ...]

    # the four procedure locals hold the 4x4 state matrix, one row per local
    locaddr.3
    locaddr.2
    locaddr.1
    locaddr.0
    # => [row0_addr, row1_addr, row2_addr, row3_addr, msg0, ..., msg7, 0, ..., 0, ...]

    # the initializer and the compressor each consume one copy of the address word
    dupw
    exec.initialize_1to1

    # A single compression suffices: the padded input is exactly one message block
    # ( = 64-bytes ) of one chunk.
    exec.compress
    # => [...]

    # read the mixed state back, last row first, so that state0 ends up on top
    padw
    loc_loadw.3
    padw
    loc_loadw.2
    padw
    loc_loadw.1
    padw
    loc_loadw.0

    exec.finalize
end