#! Initializes four memory addresses, provided for storing initial 4x4 blake3
#! state matrix ( i.e. 16 elements each of 32 -bit ), for computing blake3 2-to-1 hash
#!
#! Expected stack state:
#!
#! [state_0_3_addr, state_4_7_addr, state_8_11_addr, state_12_15_addr]
#!
#! Note, state_`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Final stack state:
#!
#! [...]
#!
#! Initialized hash state is written to the provided memory addresses; all four
#! addresses are consumed from the stack.
#!
#! Functionally this routine is equivalent to https://github.com/itzmeanjan/blake3/blob/f07d32e/include/blake3.hpp#L1709-L1713
proc.initialize_2to1
    # Same constant word is written twice: once to state_0_3_addr and, without being
    # re-pushed, once more to state_8_11_addr ( the word is still on the stack after
    # the first store, only the address is consumed ).
    push.0xA54FF53A.0x3C6EF372.0xBB67AE85.0x6A09E667
    movup.4
    mem_storew
    movup.5
    mem_storew
    dropw

    # second constant word goes to state_4_7_addr
    push.0x5BE0CD19.0x1F83D9AB.0x9B05688C.0x510E527F
    movup.4
    mem_storew
    dropw

    # last word goes to state_12_15_addr; 64 is BLOCK_LEN for 2-to-1 hashing
    # NOTE(review): 11 is presumably the blake3 flag word ( CHUNK_START | CHUNK_END | ROOT ) and
    # the two zeros the block counter; confirm against the linked reference implementation.
    push.11.64.0.0
    movup.4
    mem_storew
    dropw
end

#! Initializes four memory addresses, provided for storing initial 4x4 blake3
#! state matrix ( i.e. 16 elements each of 32 -bit ), for computing blake3 1-to-1 hash
#!
#! Expected stack state:
#!
#! [state_0_3_addr, state_4_7_addr, state_8_11_addr, state_12_15_addr]
#!
#! Note, state_`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Final stack state:
#!
#! [...]
#!
#! Initialized hash state is written to the provided memory addresses; all four
#! addresses are consumed from the stack.
#!
#! Functionally this routine is equivalent to https://github.com/itzmeanjan/blake3/blob/f07d32e/include/blake3.hpp#L1709-L1713
#! with only difference being value of BLOCK_LEN = 32
proc.initialize_1to1
    # Same constant word is written to both state_0_3_addr and state_8_11_addr
    # ( stored twice before being dropped ).
    push.0xA54FF53A.0x3C6EF372.0xBB67AE85.0x6A09E667
    movup.4
    mem_storew
    movup.5
    mem_storew
    dropw

    # second constant word goes to state_4_7_addr
    push.0x5BE0CD19.0x1F83D9AB.0x9B05688C.0x510E527F
    movup.4
    mem_storew
    dropw

    # last word goes to state_12_15_addr; identical to the 2-to-1 variant
    # except that BLOCK_LEN is 32 instead of 64
    push.11.32.0.0
    movup.4
    mem_storew
    dropw
end

#! Permutes ordered message words, kept on stack top ( = sixteen 32 -bit BLAKE3 words )
#!
#! Expected stack top:
#!
#! [s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15]
#!
#! After permutation, stack top:
#!
#! [s2, s6, s3, s10, s7, s0, s4, s13, s1, s11, s12, s5, s9, s14, s15, s8]
#!
#! Only stack manipulation instructions are used, so stack depth is unchanged.
#!
#! See https://github.com/itzmeanjan/blake3/blob/f07d32ec10cbc8a10663b7e6539e0b1dab3e453b/include/blake3.hpp#L1623-L1639
#! and https://github.com/0xPolygonMiden/miden-vm/pull/313#discussion_r922627984
proc.permute_msg_words
    movdn.7
    movup.5
    movdn.2
    movup.4
    movdn.7

    swapw.3
    swap
    movdn.7
    swapdw

    movup.2
    movdn.7
    swapw
    swapw.2

    movup.3
    movdn.6
    movdn.5
    movup.3
    swapw
    movup.3
    swapdw
end

#! Given blake3 state matrix on stack top ( in order ) as 16 elements ( each of 32 -bit ),
#! this routine computes output chaining value i.e. 2-to-1 hashing digest.
#!
#! Expected stack state:
#!
#! [state0, state1, state2, state3, state4, state5, state6, state7, state8, state9, state10, state11, state12, state13, state14, state15]
#!
#! After finalizing, stack should look like
#!
#! [dig0, dig1, dig2, dig3, dig4, dig5, dig6, dig7]
#!
#! where dig`i` = state`i` ^ state`i+8` | i ∈ [0, 8); upper half of state is consumed
#! in the process, so stack shrinks by 8 elements.
#!
#! See https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L116-L119 ,
#! you'll notice I've skipped executing second statement in loop body of above hyperlinked implementation,
#! that's because it doesn't dictate what output of 2-to-1 hash will be.
proc.finalize
    # After every xor one element of the upper half is gone, hence the next
    # partner word is always found at stack position 8.

    # dig0 = state0 ^ state8
    movup.8
    u32xor

    # dig1 = state1 ^ state9
    swap
    movup.8
    u32xor
    swap

    # dig2 = state2 ^ state10
    movup.2
    movup.8
    u32xor
    movdn.2

    # dig3 = state3 ^ state11
    movup.3
    movup.8
    u32xor
    movdn.3

    # dig4 = state4 ^ state12
    movup.4
    movup.8
    u32xor
    movdn.4

    # dig5 = state5 ^ state13
    movup.5
    movup.8
    u32xor
    movdn.5

    # dig6 = state6 ^ state14
    movup.6
    movup.8
    u32xor
    movdn.6

    # dig7 = state7 ^ state15
    movup.7
    movup.8
    u32xor
    movdn.7
end

#! Given blake3 state matrix ( total 16 elements, each of 32 -bit ) and
#! 8 message words ( each of 32 -bit ), this routine performs column-wise mixing
#! of message words into blake3 hash state.
#!
#! Functionality wise this routine is equivalent to https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L55-L59
#!
#! Expected stack state:
#!
#! [state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr, m0, m1, m2, m3, m4, m5, m6, m7]
#!
#! Note, state_`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Meaning four consecutive blake3 state words can be read from memory easily.
#!
#! Final stack state:
#!
#! [state0, state1, state2, state3, state4, state5, state6, state7, state8, state9, state10, state11, state12, state13, state14, state15]
#!
#! i.e. whole blake3 state is placed on stack ( in order ). State is only read from memory here,
#! nothing is written back to the provided addresses; one local is used as scratch space
#! for message words.
proc.columnar_mixing.1
    # bring message words above the addresses and regroup them as
    # [m1, m3, m5, m7, m0, m2, m4, m6]
    swapw.2
    swapw

    movup.7
    movup.6
    movup.5
    movup.4

    # keep [m1, m3, m5, m7] in local 0 for the second half of mixing
    loc_storew.0

    # overwrite top word with state[4..8], then load state[0..4] above it
    movup.9
    mem_loadw
    movup.8
    push.0.0.0.0
    movup.4
    mem_loadw

    # state[i] = state[i] + state[i+4] + m[2i] | i ∈ [0, 4) ; carry is dropped
    movup.8
    dup.5
    u32overflowing_add3
    drop

    swap
    movup.8
    dup.6
    u32overflowing_add3
    drop
    swap

    movup.2
    dup.6
    movup.9
    u32overflowing_add3
    drop
    movdn.2

    movup.3
    dup.7
    movup.9
    u32overflowing_add3
    drop
    movdn.3

    # load state[12..16]
    movup.9
    push.0.0.0.0
    movup.4
    mem_loadw

    # state[i+12] = rotr(state[i+12] ^ state[i], 16) | i ∈ [0, 4)
    dup.4
    u32xor
    u32rotr.16

    swap
    dup.5
    u32xor
    u32rotr.16
    swap

    movup.2
    dup.6
    u32xor
    u32rotr.16
    movdn.2

    movup.3
    dup.7
    u32xor
    u32rotr.16
    movdn.3

    # load state[8..12]
    movup.12
    push.0.0.0.0
    movup.4
    mem_loadw

    # state[i+8] = state[i+8] + state[i+12] | i ∈ [0, 4)
    dup.4
    u32wrapping_add

    swap
    dup.5
    u32wrapping_add
    swap

    movup.2
    dup.6
    u32wrapping_add
    movdn.2

    movup.3
    dup.7
    u32wrapping_add
    movdn.3

    # bring state[4..8] to top
    movupw.3

    # state[i+4] = rotr(state[i+4] ^ state[i+8], 12) | i ∈ [0, 4)
    dup.4
    u32xor
    u32rotr.12

    swap
    dup.5
    u32xor
    u32rotr.12
    swap

    movup.2
    dup.6
    u32xor
    u32rotr.12
    movdn.2

    movup.3
    dup.7
    u32xor
    u32rotr.12
    movdn.3

    # bring state[0..4] to top and restore [m1, m3, m5, m7] from local 0 just below it
    movupw.3
    push.0.0.0.0
    loc_loadw.0
    swapw

    # state[i] = state[i] + state[i+4] + m[2i+1] | i ∈ [0, 4) ; carry is dropped
    movup.4
    dup.8
    u32overflowing_add3
    drop

    swap
    movup.4
    dup.8
    u32overflowing_add3
    drop
    swap

    movup.2
    movup.4
    dup.8
    u32overflowing_add3
    drop
    movdn.2

    movup.3
    movup.4
    dup.8
    u32overflowing_add3
    drop
    movdn.3

    # bring state[12..16] to top
    movupw.3

    # state[i+12] = rotr(state[i+12] ^ state[i], 8) | i ∈ [0, 4)
    dup.4
    u32xor
    u32rotr.8

    swap
    dup.5
    u32xor
    u32rotr.8
    swap

    movup.2
    dup.6
    u32xor
    u32rotr.8
    movdn.2

    movup.3
    dup.7
    u32xor
    u32rotr.8
    movdn.3

    # bring state[8..12] to top
    movupw.3

    # state[i+8] = state[i+8] + state[i+12] | i ∈ [0, 4)
    dup.4
    u32wrapping_add

    swap
    dup.5
    u32wrapping_add
    swap

    movup.2
    dup.6
    u32wrapping_add
    movdn.2

    movup.3
    dup.7
    u32wrapping_add
    movdn.3

    # bring state[4..8] to top
    movupw.3

    # state[i+4] = rotr(state[i+4] ^ state[i+8], 7) | i ∈ [0, 4)
    dup.4
    u32xor
    u32rotr.7

    swap
    dup.5
    u32xor
    u32rotr.7
    swap

    movup.2
    dup.6
    u32xor
    u32rotr.7
    movdn.2

    movup.3
    dup.7
    u32xor
    u32rotr.7
    movdn.3

    # final rotation puts state[0..4] on top, so state is in order
    movupw.3
end

#! Given blake3 state matrix ( total 16 elements, each of 32 -bit ) and
#! 8 message words ( each of 32 -bit ), this routine performs diagonal-wise mixing
#! of message words into blake3 hash state.
#!
#! Functionality wise this routine is equivalent to https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L61-L64
#!
#! Expected stack state:
#!
#! [state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr, m0, m1, m2, m3, m4, m5, m6, m7]
#!
#! Note, state_`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Meaning four consecutive blake3 state words can be read from memory easily.
#!
#! Final stack state:
#!
#! [state0, state1, state2, state3, state4, state5, state6, state7, state8, state9, state10, state11, state12, state13, state14, state15]
#!
#! i.e. whole blake3 state is placed on stack ( in order ). State is only read from memory here,
#! nothing is written back to the provided addresses; one local is used as scratch space
#! for message words.
#!
#! Diagonals being mixed are (0, 5, 10, 15), (1, 6, 11, 12), (2, 7, 8, 13) and (3, 4, 9, 14).
proc.diagonal_mixing.1
    # bring message words above the addresses and regroup them as
    # [m1, m3, m5, m7, m0, m2, m4, m6]
    swapw.2
    swapw

    movup.7
    movup.6
    movup.5
    movup.4

    # keep [m1, m3, m5, m7] in local 0 for the second half of mixing
    loc_storew.0

    # overwrite top word with state[4..8], then load state[0..4] above it
    movup.9
    mem_loadw
    movup.8
    push.0.0.0.0
    movup.4
    mem_loadw

    # state[i] = state[i] + state[4 + (i+1)%4] + m[2i] | i ∈ [0, 4) ; carry is dropped
    movup.8
    dup.6
    u32overflowing_add3
    drop

    swap
    movup.8
    dup.7
    u32overflowing_add3
    drop
    swap

    movup.2
    movup.8
    dup.8
    u32overflowing_add3
    drop
    movdn.2

    movup.3
    movup.8
    dup.5
    u32overflowing_add3
    drop
    movdn.3

    # load state[12..16]
    movup.9
    push.0.0.0.0
    movup.4
    mem_loadw

    # state[12 + (i+3)%4] = rotr(state[12 + (i+3)%4] ^ state[i], 16) | i ∈ [0, 4)
    movup.3
    dup.4
    u32xor
    u32rotr.16
    movdn.3

    dup.5
    u32xor
    u32rotr.16

    swap
    dup.6
    u32xor
    u32rotr.16
    swap

    movup.2
    dup.7
    u32xor
    u32rotr.16
    movdn.2

    # load state[8..12]
    movup.12
    push.0.0.0.0
    movup.4
    mem_loadw

    # state[8 + (i+2)%4] = state[8 + (i+2)%4] + state[12 + (i+3)%4] | i ∈ [0, 4)
    movup.2
    dup.7
    u32wrapping_add
    movdn.2

    movup.3
    dup.4
    u32wrapping_add
    movdn.3

    dup.5
    u32wrapping_add

    swap
    dup.6
    u32wrapping_add
    swap

    # bring state[4..8] to top
    movupw.3

    # state[4 + (i+1)%4] = rotr(state[4 + (i+1)%4] ^ state[8 + (i+2)%4], 12) | i ∈ [0, 4)
    swap
    dup.6
    u32xor
    u32rotr.12
    swap

    movup.2
    dup.7
    u32xor
    u32rotr.12
    movdn.2

    movup.3
    dup.4
    u32xor
    u32rotr.12
    movdn.3

    dup.5
    u32xor
    u32rotr.12

    # bring state[0..4] to top and restore [m1, m3, m5, m7] from local 0 just below it
    movupw.3
    push.0.0.0.0
    loc_loadw.0
    swapw

    # state[i] = state[i] + state[4 + (i+1)%4] + m[2i+1] | i ∈ [0, 4) ; carry is dropped
    movup.4
    dup.9
    u32overflowing_add3
    drop

    swap
    movup.4
    dup.9
    u32overflowing_add3
    drop
    swap

    movup.2
    movup.4
    dup.9
    u32overflowing_add3
    drop
    movdn.2

    movup.3
    movup.4
    dup.5
    u32overflowing_add3
    drop
    movdn.3

    # bring state[12..16] to top
    movupw.3

    # state[12 + (i+3)%4] = rotr(state[12 + (i+3)%4] ^ state[i], 8) | i ∈ [0, 4)
    movup.3
    dup.4
    u32xor
    u32rotr.8
    movdn.3

    dup.5
    u32xor
    u32rotr.8

    swap
    dup.6
    u32xor
    u32rotr.8
    swap

    movup.2
    dup.7
    u32xor
    u32rotr.8
    movdn.2

    # bring state[8..12] to top
    movupw.3

    # state[8 + (i+2)%4] = state[8 + (i+2)%4] + state[12 + (i+3)%4] | i ∈ [0, 4)
    movup.2
    dup.7
    u32wrapping_add
    movdn.2

    movup.3
    dup.4
    u32wrapping_add
    movdn.3

    dup.5
    u32wrapping_add

    swap
    dup.6
    u32wrapping_add
    swap

    # bring state[4..8] to top
    movupw.3

    # state[4 + (i+1)%4] = rotr(state[4 + (i+1)%4] ^ state[8 + (i+2)%4], 7) | i ∈ [0, 4)
    swap
    dup.6
    u32xor
    u32rotr.7
    swap

    movup.2
    dup.7
    u32xor
    u32rotr.7
    movdn.2

    movup.3
    dup.4
    u32xor
    u32rotr.7
    movdn.3

    dup.5
    u32xor
    u32rotr.7

    # final rotation puts state[0..4] on top, so state is in order
    movupw.3
end

#! Given blake3 state matrix ( total 16 elements, each of 32 -bit ) and
#! 16 message words ( each of 32 -bit ), this routine applies single round of mixing
#! of message words into hash state i.e. msg_word[0..8] are mixed into hash state using
#! columnar mixing while remaining message words ( msg_word[8..16] ) are mixed into hash state
#! using diagonal mixing.
#!
#! Functionality wise this routine is equivalent to https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L54-L65
#!
#! Expected stack state:
#!
#! [state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15]
#!
#! Note, state_`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Meaning four consecutive blake3 state words can be read from memory easily.
#!
#! Final stack state:
#!
#! [...]
#!
#! i.e. mixed state matrix lives in memory addresses {state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr},
#! which were provided, on stack top, while invoking this routine. Both the addresses and all
#! 16 message words are consumed from the stack.
proc.round.5
    # remember caller provided state addresses in local 0
    # ( columnar mixing consumes them from the stack )
    loc_storew.0

    exec.columnar_mixing

    # park column-mixed state in locals 1..4, leaving only m8..m15 on the stack
    loc_storew.1
    dropw
    loc_storew.2
    dropw
    loc_storew.3
    dropw
    loc_storew.4
    dropw

    # diagonal mixing reads its input state from those locals
    locaddr.4
    locaddr.3
    locaddr.2
    locaddr.1

    exec.diagonal_mixing

    # restore caller provided addresses and write state[0..4] back
    push.0.0.0.0
    loc_loadw.0
    swapw
    movup.4
    mem_storew
    dropw

    # Write remaining three state words back. A zero is pushed below the remaining
    # addresses each time so that they keep forming a full word, which lets `swapw`
    # bring the next state word to the top.
    repeat.3
        push.0
        movdn.3
        swapw
        movup.4
        mem_storew
        dropw
    end

    # discard the three padding zeros
    repeat.3
        drop
    end
end

#! Given blake3 state matrix ( total 16 elements, each of 32 -bit ) and a message block
#! i.e. 16 message words ( each of 32 -bit ), this routine applies 7 rounds of mixing
#! of (permuted) message words into hash state.
#!
#! Functionality wise this routine is equivalent to https://github.com/BLAKE3-team/BLAKE3/blob/da4c792/reference_impl/reference_impl.rs#L75-L114
#!
#! Expected stack state:
#!
#! [state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr, m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15]
#!
#! Note, state_`i`_`j`_addr -> absolute address of {state[i], state[i+1], state[i+2], state[i+3]} in memory | j = i+3
#!
#! Meaning four consecutive blake3 state words can be read from memory easily.
#!
#! Final stack state:
#!
#! [...]
#!
#! i.e. 7 -round mixed state matrix lives in memory addresses {state0_3_addr, state4_7_addr, state8_11_addr, state12_15_addr},
#! which were provided, on stack top, while invoking this routine. So updated state matrix can be read by caller routine, by reading
#! the content of memory addresses where state was provided as routine input.
proc.compress.1
    # keep state addresses in local 0, so that they can be supplied to every round
    loc_storew.0
    dropw

    # apply first 6 rounds of mixing
    repeat.6
        # round `i` | i ∈ [1..7)

        # duplicate all 16 message words, because a round consumes its copy
        repeat.4
            dupw.3
        end

        push.0.0.0.0
        loc_loadw.0
        exec.round
        exec.permute_msg_words
    end

    # round 7 ( last round, so no message word permutation required )
    push.0.0.0.0
    loc_loadw.0
    exec.round
end

#! Blake3 2-to-1 hash function, which takes 64 -bytes input and produces 32 -bytes output digest
#!
#! Expected stack state:
#!
#! [msg0, msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8, msg9, msg10, msg11, msg12, msg13, msg14, msg15, ...]
#!
#! msg`i` -> 32 -bit message word | i ∈ [0, 16)
#!
#! Final stack state:
#!
#! [dig0, dig1, dig2, dig3, dig4, dig5, dig6, dig7, ...]
#!
#! dig`i` -> 32 -bit digest word | i ∈ [0, 8)
export.hash_2to1.4
    # four locals of this procedure hold the 4x4 hash state
    locaddr.3
    locaddr.2
    locaddr.1
    locaddr.0

    exec.initialize_2to1

    # Note, chunk compression routine needs to compress only one chunk with one message
    # block ( = 64 -bytes ) because what we're doing here is 2-to-1 hashing i.e. 64 -bytes
    # input being converted to 32 -bytes output

    locaddr.3
    locaddr.2
    locaddr.1
    locaddr.0

    exec.compress

    # bring compressed state onto the stack, in order ( state[0..4] ends up on top )
    push.0.0.0.0
    loc_loadw.3
    push.0.0.0.0
    loc_loadw.2
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.finalize
end

#! Blake3 1-to-1 hash function, which takes 32 -bytes input and produces 32 -bytes output digest
#!
#! Expected stack state:
#!
#! [msg0, msg1, msg2, msg3, msg4, msg5, msg6, msg7, ...]
#!
#! msg`i` -> 32 -bit message word | i ∈ [0, 8)
#!
#! Final stack state:
#!
#! [dig0, dig1, dig2, dig3, dig4, dig5, dig6, dig7, ...]
#!
#! dig`i` -> 32 -bit digest word | i ∈ [0, 8)
export.hash_1to1.4
    # Pad 32 -bytes input message with zero bytes to make
    # 64 -bytes, which is processed same as 2-to-1 hashing
    # ( `swapdw` moves the eight zero words below the input message words )
    push.0.0.0.0.0.0.0.0
    swapdw

    # four locals of this procedure hold the 4x4 hash state
    locaddr.3
    locaddr.2
    locaddr.1
    locaddr.0

    exec.initialize_1to1

    # Note, chunk compression routine needs to compress only one chunk with one message
    # block ( = 64 -bytes ), which is obtained by padding 32 -bytes input.

    locaddr.3
    locaddr.2
    locaddr.1
    locaddr.0

    exec.compress

    # bring compressed state onto the stack, in order ( state[0..4] ends up on top )
    push.0.0.0.0
    loc_loadw.3
    push.0.0.0.0
    loc_loadw.2
    push.0.0.0.0
    loc_loadw.1
    push.0.0.0.0
    loc_loadw.0

    exec.finalize
end