u256 g_r16 = (32u8)[13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2];
u256 g_r8  = (32u8)[14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3, 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3];

u256 g_cnt     = (8u32)[7,6,5,4,3,2,1,0];
u256 g_cnt_inc = (8u32)[8,8,8,8,8,8,8,8];

u256 g_p1 = (2u128)[1,0];
u256 g_p2 = (2u128)[2,2];

u128 g_sigma  = 0x6b20657479622d323320646e61707865;
u32  g_sigma0 = 0x61707865;
u32  g_sigma1 = 0x3320646e;
u32  g_sigma2 = 0x79622d32;
u32  g_sigma3 = 0x6b206574;
u128 g_p0 = 0;

param int i_0  = 0;
param int i_4  = 4;
param int i_32 = 32;

fn load_shufb_cmd() -> stack u256, stack u256
{
  reg u256 r16, r8;
  stack u256 s_r16, s_r8;

  r16 = g_r16;
  r8  = g_r8;

  s_r16 = r16;
  s_r8  = r8;

  return s_r16, s_r8;
}

// init
fn init_x2(reg u64 key nonce, reg u32 counter) -> reg u256[4]
{
  reg u256[4] st;
  reg u128 nc;
  stack u128 s_nc;

  nc = g_p0;
  nc = #x86_VPINSR_4u32(nc, counter, 0);
  nc = #x86_VPINSR_4u32(nc, (u32)[nonce + 0], 1);
  nc = #x86_VPINSR_2u64(nc, (u64)[nonce + 4], 1);
  s_nc = nc;

  st[0] = #x86_VPBROADCAST_2u128(g_sigma);
  st[1] = #x86_VPBROADCAST_2u128((u128)[key + 0]);
  st[2] = #x86_VPBROADCAST_2u128((u128)[key + 16]);
  st[3] = #x86_VPBROADCAST_2u128(s_nc);
  st[3] +8u32= g_p1;

  // st
  // 0 { sigma      , sigma      }
  // 1 { k[127:0]   , k[127:0]   }
  // 2 { k[255:128] , k[255:128] }
  // 3 { n, cnt+1   , n, cnt     }

  return st;
}

fn init_x8(reg u64 key nonce, reg u32 counter) -> stack u256[16]
{
  inline int i;
  stack u256[16] st_;
  reg u256[16] st;
  stack u32 s_counter;

  s_counter = counter;

  st[0] = #x86_VPBROADCAST_8u32(g_sigma0);
  st[1] = #x86_VPBROADCAST_8u32(g_sigma1);
  st[2] = #x86_VPBROADCAST_8u32(g_sigma2);
  st[3] = #x86_VPBROADCAST_8u32(g_sigma3);

  for i=0 to 8
  { st[i+4] = #x86_VPBROADCAST_8u32((u32)[key + i*4]); }

  st[12] = #x86_VPBROADCAST_8u32(s_counter);
  st[12] +8u32= g_cnt;

  for i=0 to 3
  { st[i+13] = #x86_VPBROADCAST_8u32((u32)[nonce + i*4]); }

  // st
  //  0 { ... , sigma0     , sigma0     }
  //  1 { ... , sigma1     , sigma1     }
  //  2 { ... , sigma2     , sigma2     }
  //  3 { ... , sigma3     , sigma3     }
  //  4 { ... , k[31:0]    , k[31:0]    }
  //  ...
  // 11 { ... , k[255:224] , k[255:224] }
  // 12 { ... , ctr+1      , ctr        }
  // 13 { ... , n[31:0]    , n[31:0]    }
  //  ...
  // 15 { ... , n[95:64]   , n[95:64]   }

  st_ = st;
  return st_;
}
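// For reference, both init functions build (and then replicate across lanes) the standard
// ChaCha20 state of sixteen 32-bit words (RFC 8439):
//
//   0..3:   0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 ("expand 32-byte k")
//   4..11:  key, read as eight little-endian 32-bit words
//   12:     block counter
//   13..15: 96-bit nonce, read as three little-endian 32-bit words
//
// init_x2 packs two such states (counters cnt and cnt+1) into the two 128-bit lanes of each
// register; init_x8 keeps eight states (counters cnt..cnt+7) "word-sliced", i.e. one state
// word per register, with each 32-bit lane holding that word for one of the eight blocks.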
// copy state
fn copy_state_x2(reg u256[4] st) -> reg u256[4]
{
  reg u256[4] k;
  k = st;
  return k;
}

fn copy_state_x4(reg u256[4] st) -> reg u256[4], reg u256[4]
{
  reg u256[4] k1 k2;

  k1 = st;
  k2 = st;
  k2[3] +8u32= g_p2;

  // k2                          k1
  // { sigma      , sigma      } { sigma      , sigma      }
  // { k[127:0]   , k[127:0]   } { k[127:0]   , k[127:0]   }
  // { k[255:128] , k[255:128] } { k[255:128] , k[255:128] }
  // { n, cnt+3   , n, cnt+2   } { n, cnt+1   , n, cnt     }

  return k1, k2;
}

fn copy_state_x8(stack u256[16] st) -> reg u256[16]
{
  reg u256[16] k;
  k = st;
  return k;
}

// sum states
fn sum_states_x2(reg u256[4] k st) -> reg u256[4]
{
  inline int i;
  for i=0 to 4
  { k[i] +8u32= st[i]; }
  return k;
}

fn sum_states_x4(reg u256[4] k1 k2 st) -> reg u256[4], reg u256[4]
{
  k1 = sum_states_x2(k1, st);
  k2 = sum_states_x2(k2, st);
  k2[3] +8u32= g_p2;
  return k1, k2;
}

fn sum_states_x8(reg u256[16] k, stack u256[16] st) -> reg u256[16]
{
  inline int i;
  for i=0 to 16
  { k[i] +8u32= st[i]; }
  return k;
}

// increment
fn increment_counter_x8(stack u256[16] s) -> stack u256[16]
{
  reg u256 t;
  t = g_cnt_inc;
  t +8u32= s[12];
  s[12] = t;
  return s;
}

// store auxiliary functions
fn update_ptr(reg u64 output plain, reg u32 len, inline int n) -> reg u64, reg u64, reg u32
{
  output += n;
  plain += n;
  len -= n;
  return output, plain, len;
}

// function perm_x2 receives a state k organized as follows:
//
// k[0] = { k2[3],  k2[2],  k2[1],  k2[0],  k1[3],  k1[2],  k1[1],  k1[0]  }
// k[1] = { k2[7],  k2[6],  k2[5],  k2[4],  k1[7],  k1[6],  k1[5],  k1[4]  }
// k[2] = { k2[11], k2[10], k2[9],  k2[8],  k1[11], k1[10], k1[9],  k1[8]  }
// k[3] = { k2[15], k2[14], k2[13], k2[12], k1[15], k1[14], k1[13], k1[12] }
//
// where k1 and k2 are the states corresponding to the first and second block, respectively.
// perm_x2 rearranges k into pk such that pk is equal to:
//
// pk[0] = { k1[7],  k1[6],  k1[5],  k1[4],  k1[3],  k1[2],  k1[1],  k1[0]  }
// pk[1] = { k1[15], k1[14], k1[13], k1[12], k1[11], k1[10], k1[9],  k1[8]  }
// pk[2] = { k2[7],  k2[6],  k2[5],  k2[4],  k2[3],  k2[2],  k2[1],  k2[0]  }
// pk[3] = { k2[15], k2[14], k2[13], k2[12], k2[11], k2[10], k2[9],  k2[8]  }
//
fn perm_x2(reg u256[4] k) -> reg u256[4]
{
  reg u256[4] pk;

  pk[0] = #x86_VPERM2I128(k[0], k[1], (2u4)[2, 0]);
  pk[1] = #x86_VPERM2I128(k[2], k[3], (2u4)[2, 0]);
  pk[2] = #x86_VPERM2I128(k[0], k[1], (2u4)[3, 1]);
  pk[3] = #x86_VPERM2I128(k[2], k[3], (2u4)[3, 1]);

  return pk;
}
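// on the #x86_VPERM2I128(a, b, (2u4)[hi,lo]) selectors used above: the low 128 bits of the
// result are the 128-bit lane selected by "lo" and the high 128 bits the lane selected by
// "hi", where 0 = low lane of a, 1 = high lane of a, 2 = low lane of b, 3 = high lane of b.
// So (2u4)[2,0] concatenates the two low lanes (producing pk[0] and pk[1] above) and
// (2u4)[3,1] concatenates the two high lanes (producing pk[2] and pk[3]).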
// perm_x4 is the same as perm_x2; the only difference is that it works on 4 blocks
// of stream
fn perm_x4(reg u256[4] k1 k2) -> reg u256[4], reg u256[4]
{
  reg u256[4] pk1 pk2;
  pk1 = perm_x2(k1);
  pk2 = perm_x2(k2);
  return pk1, pk2;
}

// store functions

// stores 64 bytes
fn store(reg u64 output plain, reg u32 len, reg u256[2] k) -> reg u64, reg u64, reg u32, reg u256[2]
{
  k[0] ^= (u256)[plain + 0];
  k[1] ^= (u256)[plain + 32];

  (u256)[output + 0]  = k[0];
  (u256)[output + 32] = k[1];

  output, plain, len = update_ptr(output, plain, len, 64);

  return output, plain, len, k;
}

// stores up to 64 bytes
fn store_last(reg u64 output plain, reg u32 len, reg u256[2] k)
{
  reg u256 r0;
  reg u128 r1;
  reg u64 r2 j;
  reg u8 r3;
  stack u8[16] s0;

  r0 = k[0];

  if(len >= 32)
  {
    r0 ^= (u256)[plain + 0];
    (u256)[output + 0] = r0;
    output, plain, len = update_ptr(output, plain, len, 32);
    r0 = k[1];
  }

  r1 = #x86_VEXTRACTI128(r0, 0);

  if(len >= 16)
  {
    r1 ^= (u128)[plain + 0];
    (u128)[output + 0] = r1;
    output, plain, len = update_ptr(output, plain, len, 16);
    r1 = #x86_VEXTRACTI128(r0, 1);
  }

  s0[u128 0] = r1;

  j = 0;
  while(j < len)
  {
    r3 = (u8)[plain + j];
    r3 ^= s0[(int)j];
    (u8)[output + j] = r3;
    j += 1;
  }
}

// stores 128 bytes
fn store_x2(reg u64 output plain, reg u32 len, reg u256[4] k) -> reg u64, reg u64, reg u32, reg u256[4]
{
  inline int i;

  for i=0 to 4
  { k[i] ^= (u256)[plain + 32*i]; }

  for i=0 to 4
  { (u256)[output + 32*i] = k[i]; }

  output, plain, len = update_ptr(output, plain, len, 128);

  return output, plain, len, k;
}

// stores up to 128 bytes
fn store_x2_last(reg u64 output plain, reg u32 len, reg u256[4] k)
{
  reg u256[2] r;

  r[0] = k[0];
  r[1] = k[1];

  if(len >= 64)
  {
    output, plain, len, r = store(output, plain, len, r);
    r[0] = k[2];
    r[1] = k[3];
  }

  store_last(output, plain, len, r);
}

// stores 256 bytes
fn store_x4(reg u64 output plain, reg u32 len, reg u256[8] k) -> reg u64, reg u64, reg u32
{
  inline int i;

  for i=0 to 8
  { k[i] ^= (u256)[plain + 32*i]; }

  for i=0 to 8
  { (u256)[output + 32*i] = k[i]; }

  output, plain, len = update_ptr(output, plain, len, 256);

  return output, plain, len;
}

// stores up to 256 bytes
fn store_x4_last(reg u64 output plain, reg u32 len, reg u256[8] k)
{
  inline int i;
  reg u256[4] r;

  for i=0 to 4
  { r[i] = k[i]; }

  if(len >= 128)
  {
    output, plain, len, r = store_x2(output, plain, len, r);
    for i=0 to 4
    { r[i] = k[i+4]; }
  }

  store_x2_last(output, plain, len, r);
}

// stores 512 bytes auxiliary functions
fn store_half_x8(reg u64 output plain, reg u32 len, reg u256[8] k, inline int o)
{
  inline int i;

  for i=0 to 8
  { k[i] ^= (u256)[plain + o + 64*i]; }

  for i=0 to 8
  { (u256)[output + o + 64*i] = k[i]; }
}
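// The next two functions (sub_rotate and rotate, where rotate calls sub_rotate) perform
// what is, roughly speaking, an 8x8 transpose of 32-bit words: on input, register j holds
// one state word for all 8 blocks (one block per 32-bit lane); on output, register i holds
// 8 consecutive words of a single block (registers 0..3 for blocks 1..4, registers 4..7 for
// blocks 5..8). store_x8 then calls store_half_x8 with offsets 0 and 32 so that block n
// ends up at bytes [64*(n-1), 64*n) of the stream, and interleave (further below) pairs the
// two transposed halves so that store_x8_last can reuse store_x4/store_x4_last with blocks
// in their natural order.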
fn sub_rotate(reg u256[8] t) -> reg u256[8]
{
  inline int i;
  reg u256[8] x;

  x[0] = #x86_VPUNPCKL_4u64(t[0], t[1]);
  x[1] = #x86_VPUNPCKL_4u64(t[2], t[3]);
  x[2] = #x86_VPUNPCKH_4u64(t[0], t[1]);
  x[3] = #x86_VPUNPCKH_4u64(t[2], t[3]);
  x[4] = #x86_VPUNPCKL_4u64(t[4], t[5]);
  x[5] = #x86_VPUNPCKL_4u64(t[6], t[7]);
  x[6] = #x86_VPUNPCKH_4u64(t[4], t[5]);
  x[7] = #x86_VPUNPCKH_4u64(t[6], t[7]);

  for i=0 to 4
  {
    t[i]   = #x86_VPERM2I128(x[2*i+0], x[2*i+1], (2u4)[2,0]);
    t[i+4] = #x86_VPERM2I128(x[2*i+0], x[2*i+1], (2u4)[3,1]);
  }

  return t;
}

fn rotate(reg u256[8] x) -> reg u256[8]
{
  inline int i;
  reg u256[8] t;

  for i=0 to 4
  {
    t[i]   = #x86_VPUNPCKL_8u32(x[2*i+0], x[2*i+1]);
    t[i+4] = #x86_VPUNPCKH_8u32(x[2*i+0], x[2*i+1]);
  }

  t = sub_rotate(t);

  return t;
}

fn rotate_stack(stack u256[8] s) -> reg u256[8]
{
  inline int i;
  reg u256[8] t x;

  for i=0 to 4
  { x[i] = s[2*i+0]; }

  for i=0 to 4
  {
    t[  i] = #x86_VPUNPCKL_8u32(x[i], s[2*i+1]);
    t[4+i] = #x86_VPUNPCKH_8u32(x[i], s[2*i+1]);
  }

  t = sub_rotate(t);

  return t;
}

fn rotate_first_half_x8(reg u256[16] k) -> reg u256[8], stack u256[8]
{
  inline int i;
  stack u256[8] s_k8_15;
  reg u256[8] k0_7;

  for i=0 to 8
  { s_k8_15[i] = k[8+i]; }

  for i=0 to 8
  { k0_7[i] = k[i]; }

  k0_7 = rotate(k0_7);

  return k0_7, s_k8_15;
}

fn rotate_second_half_x8(stack u256[8] s_k8_15) -> reg u256[8]
{
  inline int i;
  reg u256[8] k8_15;

  k8_15 = rotate_stack(s_k8_15);

  return k8_15;
}

fn interleave(stack u256[8] s, reg u256[8] k, inline int o) -> reg u256[8]
{
  inline int i;
  reg u256[8] sk;

  for i=0 to 4
  {
    sk[2*i+0] = s[o + i];
    sk[2*i+1] = k[o + i];
  }

  return sk;
}

// stores 512 bytes
fn store_x8(reg u64 output plain, reg u32 len, reg u256[16] k) -> reg u64, reg u64, reg u32
{
  stack u256[8] s_k8_15;
  reg u256[8] k0_7, k8_15;

  k0_7, s_k8_15 = rotate_first_half_x8(k);
  store_half_x8(output, plain, len, k0_7, i_0);

  k8_15 = rotate_second_half_x8(s_k8_15);
  store_half_x8(output, plain, len, k8_15, i_32);

  output, plain, len = update_ptr(output, plain, len, 512);

  return output, plain, len;
}

// stores up to 512 bytes
fn store_x8_last(reg u64 output plain, reg u32 len, reg u256[16] k)
{
  inline int i;
  stack u256[8] s_k0_7 s_k8_15;
  reg u256[8] k0_7 k8_15 i0_7;

  k0_7, s_k8_15 = rotate_first_half_x8(k);
  s_k0_7 = k0_7;

  k8_15 = rotate_second_half_x8(s_k8_15);

  i0_7 = interleave(s_k0_7, k8_15, i_0);

  if(len >= 256)
  {
    output, plain, len = store_x4(output, plain, len, i0_7);
    i0_7 = interleave(s_k0_7, k8_15, i_4);
  }

  store_x4_last(output, plain, len, i0_7);
}

// rounds related functions

// notes about round_x2 (quad_quarter_round_x2 -> round_x2): on how to produce
// 128 bytes of stream material
//
// in the non-vectorized implementation, and for a given state "k1", the following
// operations are performed for the column_round:
//
// k1 = quarter_round(k1, 0, 4, 8, 12);
// k1 = quarter_round(k1, 1, 5, 9, 13); (as seen in the original spec)
// k1 = quarter_round(k1, 2, 6, 10, 14);
// k1 = quarter_round(k1, 3, 7, 11, 15);
//
// which is equivalent to the following when inlining the quarter_round function:
//
// k1 = line(k1, 0, 4, 12, 16); k1 = line(k1,  8, 12, 4, 12); k1 = line(k1, 0, 4, 12, 8); k1 = line(k1,  8, 12, 4, 7);
// k1 = line(k1, 1, 5, 13, 16); k1 = line(k1,  9, 13, 5, 12); k1 = line(k1, 1, 5, 13, 8); k1 = line(k1,  9, 13, 5, 7);
// k1 = line(k1, 2, 6, 14, 16); k1 = line(k1, 10, 14, 6, 12); k1 = line(k1, 2, 6, 14, 8); k1 = line(k1, 10, 14, 6, 7);
// k1 = line(k1, 3, 7, 15, 16); k1 = line(k1, 11, 15, 7, 12); k1 = line(k1, 3, 7, 15, 8); k1 = line(k1, 11, 15, 7, 7);
//
// with the line(k1,a,b,c,r) function defined as:
//
// k1[a] += k1[b];
// k1[c] ^= k1[a];
// k1[c] = #x86_ROL_32(k1[c], r);
//
// in order to process 2 blocks at once we can thus arrange the state in a "k"
// register array, of type u256[4], that contains the 2 states k1 and k2, for the first
// and second block, respectively:
//
// let the state for block 1 be "u32[16] k1" and "u32[16] k2" for block 2. Then:
//
// k[0] = { k2[3],  k2[2],  k2[1],  k2[0],  k1[3],  k1[2],  k1[1],  k1[0]  }
// k[1] = { k2[7],  k2[6],  k2[5],  k2[4],  k1[7],  k1[6],  k1[5],  k1[4]  }
// k[2] = { k2[11], k2[10], k2[9],  k2[8],  k1[11], k1[10], k1[9],  k1[8]  }
// k[3] = { k2[15], k2[14], k2[13], k2[12], k1[15], k1[14], k1[13], k1[12] }
//
// and, for example, if we compute:
//
// k[0] +8u32= k[1];
// k[3] ^= k[0];
// k[3] = #x86_VPSHUFB_256(k[3], r16);
//
// with r16 being (32u8)[13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2],
// that is the same as computing:
//
// k1 = line(k1, 0, 4, 12, 16);
// k1 = line(k1, 1, 5, 13, 16);
// k1 = line(k1, 2, 6, 14, 16);
// k1 = line(k1, 3, 7, 15, 16);
//
// k2 = line(k2, 0, 4, 12, 16);
// k2 = line(k2, 1, 5, 13, 16);
// k2 = line(k2, 2, 6, 14, 16);
// k2 = line(k2, 3, 7, 15, 16);
//
// we are then vectorizing "vertically" (across quarter_round's) and "horizontally" (across blocks).
//
//
// Let's now define some auxiliary functions to help define round_x2.
//
// We know that we need to perform some rotates (to the left) on some parts of the
// state by 16, 12, 8, and 7.
//
// Rotates by 16 and 8 are "easy" to do: we can use the shuffle instruction to achieve them.
// As stated in the Intel Intrinsics documentation for vpshufb, this instruction
// "shuffle[s] 8-bit integers in a within 128-bit lanes according to shuffle control
// mask in the corresponding 8-bit element of b, and store[s] the results in dst."
//
// so, to rotate by 16, we need the following mask:
// (32u8)[13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2]
//
// and the following mask to rotate by 8:
// (32u8)[14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3, 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3]
//
// in order to rotate by 12 and 7 it is necessary to perform a shift left, a shift right,
// and then an or (or an xor).
// we can define a rotate_x8 function to abstract the previously described behaviours:
fn rotate_x8(reg u256[4] k, inline int i r, reg u256 r16 r8) -> reg u256[4]
{
  reg u256 t;

  if(r==16)
  { k[i] = #x86_VPSHUFB_256(k[i], r16); }
  else { if(r==8)
  { k[i] = #x86_VPSHUFB_256(k[i], r8); }
  else
  {
    t = k[i] <<8u32 r;
    k[i] = k[i] >>8u32 (32-r);
    k[i] ^= t;
  }}

  return k;
}

// line_x8 performs 8 line computations across "quarter_round's" and
// "blocks", as previously described
fn line_x8(reg u256[4] k, inline int a b c r, reg u256 r16 r8) -> reg u256[4]
{
  k[a/4] +8u32= k[b/4];
  k[c/4] ^= k[a/4];
  k = rotate_x8(k, (c/4), r, r16, r8);
  return k;
}

// and finally we can define the quad_quarter_round_x2, or just round_x2, function.
//
// if we look at the earlier inlining of quarter_round we can read the following line:
// k1 = line(k1, 0, 4, 12, 16); k1 = line(k1, 8, 12, 4, 12); k1 = line(k1, 0, 4, 12, 8); k1 = line(k1, 8, 12, 4, 7);
//
// since we wrote line_x8 with /4 at the indexes we can just define round_x2 as:
//
fn round_x2(reg u256[4] k, reg u256 r16 r8) -> reg u256[4]
{
  k = line_x8(k, 0,  4, 12, 16, r16, r8);
  k = line_x8(k, 8, 12,  4, 12, r16, r8);
  k = line_x8(k, 0,  4, 12,  8, r16, r8);
  k = line_x8(k, 8, 12,  4,  7, r16, r8);
  return k;
}

// and the function column_round_x2 consists in just a call to round_x2
fn column_round_x2(reg u256[4] k, reg u256 r16 r8) -> reg u256[4]
{
  k = round_x2(k, r16, r8);
  return k;
}

// but for the diagonal_round_x2 we need to change how the state is organized.
// First, let's see what a diagonal_round is in a non-vectorized implementation:
//
// k1 = quarter_round(k1, 0, 5, 10, 15);
// k1 = quarter_round(k1, 1, 6, 11, 12);
// k1 = quarter_round(k1, 2, 7, 8, 13);
// k1 = quarter_round(k1, 3, 4, 9, 14);
//
// so it is basically a column_round:
//
// k1 = quarter_round(k1, 0, 4, 8, 12);
// k1 = quarter_round(k1, 1, 5, 9, 13);
// k1 = quarter_round(k1, 2, 6, 10, 14);
// k1 = quarter_round(k1, 3, 7, 11, 15);
//
// where 5 plays the role of 4, 10 the role of 8, 15 the role of 12, and so on. So we only
// need to rearrange the state:
//
// 4, 5, 6, 7     -> 5, 6, 7, 4
// 8, 9, 10, 11   -> 10, 11, 8, 9
// 12, 13, 14, 15 -> 15, 12, 13, 14
//
// As such, we want our state k to be:
//
// k[0] = { k2[3],  k2[2],  k2[1],  k2[0],  k1[3],  k1[2],  k1[1],  k1[0]  }
// k[1] = { k2[4],  k2[7],  k2[6],  k2[5],  k1[4],  k1[7],  k1[6],  k1[5]  }
// k[2] = { k2[9],  k2[8],  k2[11], k2[10], k1[9],  k1[8],  k1[11], k1[10] }
// k[3] = { k2[14], k2[13], k2[12], k2[15], k1[14], k1[13], k1[12], k1[15] }
//
// this transformation of the state can be achieved with the following function:
fn shuffle_state(reg u256[4] k) -> reg u256[4]
{
  k[1] = #x86_VPSHUFD_256(k[1], (4u2)[0,3,2,1]);
  k[2] = #x86_VPSHUFD_256(k[2], (4u2)[1,0,3,2]);
  k[3] = #x86_VPSHUFD_256(k[3], (4u2)[2,1,0,3]);
  return k;
}

// and we also need the inverse function to put everything back into its place:
fn reverse_shuffle_state(reg u256[4] k) -> reg u256[4]
{
  k[1] = #x86_VPSHUFD_256(k[1], (4u2)[2,1,0,3]);
  k[2] = #x86_VPSHUFD_256(k[2], (4u2)[1,0,3,2]);
  k[3] = #x86_VPSHUFD_256(k[3], (4u2)[0,3,2,1]);
  return k;
}
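// on the selectors: in #x86_VPSHUFD_256(x, (4u2)[s3,s2,s1,s0]), dword j of each 128-bit
// lane of the result is dword sj of the same lane of x. For k[1], whose lane dwords are
// (k[4], k[5], k[6], k[7]) from least to most significant, the selector (4u2)[0,3,2,1]
// therefore produces (k[5], k[6], k[7], k[4]) -- the "4,5,6,7 -> 5,6,7,4" rearrangement
// above -- and similarly for k[2] and k[3]. reverse_shuffle_state simply applies the
// inverse selectors.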
// finally, diagonal_round_x2 can be defined as:
fn diagonal_round_x2(reg u256[4] k, reg u256 r16 r8) -> reg u256[4]
{
  k = shuffle_state(k);
  k = round_x2(k, r16, r8);
  k = reverse_shuffle_state(k);
  return k;
}

// and the rounds_x2 function as:
fn rounds_x2(reg u256[4] k) -> reg u256[4]
{
  reg u64 c;
  reg u256 r16 r8;

  r16 = g_r16;
  r8  = g_r8;

  c = 0;
  while(c < 10)
  {
    k = column_round_x2(k, r16, r8);
    k = diagonal_round_x2(k, r16, r8);
    c += 1;
  }

  return k;
}

// notes about round_x4 (quad_quarter_round_x4 -> round_x4): on how to produce 256 bytes of stream material.
//
// In here the story repeats itself and the strategy is pretty much the same as the
// one described for round_x2.
//
// round_x4 goes through 4 blocks at a time:
fn round_x4(reg u256[4] k1 k2, reg u256 r16 r8) -> reg u256[4], reg u256[4]
{
  k1 = round_x2(k1, r16, r8);
  k2 = round_x2(k2, r16, r8);
  return k1, k2;
}

// the following functions just reuse the previous definitions (from rounds_x2)
fn column_round_x4(reg u256[4] k1 k2, reg u256 r16 r8) -> reg u256[4], reg u256[4]
{
  k1, k2 = round_x4(k1, k2, r16, r8);
  return k1, k2;
}

fn shuffle_state_x2(reg u256[4] k1 k2) -> reg u256[4], reg u256[4]
{
  k1 = shuffle_state(k1);
  k2 = shuffle_state(k2);
  return k1, k2;
}

fn reverse_shuffle_state_x2(reg u256[4] k1 k2) -> reg u256[4], reg u256[4]
{
  k1 = reverse_shuffle_state(k1);
  k2 = reverse_shuffle_state(k2);
  return k1, k2;
}

fn diagonal_round_x4(reg u256[4] k1 k2, reg u256 r16 r8) -> reg u256[4], reg u256[4]
{
  k1, k2 = shuffle_state_x2(k1, k2);
  k1, k2 = round_x4(k1, k2, r16, r8);
  k1, k2 = reverse_shuffle_state_x2(k1, k2);
  return k1, k2;
}

fn rounds_x4(reg u256[4] k1 k2) -> reg u256[4], reg u256[4]
{
  reg u64 c;
  reg u256 r16 r8;

  r16 = g_r16;
  r8  = g_r8;

  c = 0;
  while(c < 10)
  {
    k1, k2 = column_round_x4(k1, k2, r16, r8);
    k1, k2 = diagonal_round_x4(k1, k2, r16, r8);
    c += 1;
  }

  return k1, k2;
}

// notes about round_x8: on how to produce 512 bytes of stream material.
//
// 512 bytes are 8 times 64, which means that we are evaluating 8 blocks at a time.
// Since the number of ymm registers is limited to 16, and the previous approach requires
// 2 registers for the r16 r8 masks and 1 register for a temporary value used in the
// rotates, there would only be 13 registers left for the state, so too many memory spills
// would have to happen. For this function we are going to forget about "vertical"
// vectorization (see the section regarding round_x2) and just focus on vectorizing across
// blocks. We are going to use a strategy identical to the one used in the reference
// implementation, where the quarter_rounds are swapped so that the number of memory
// spills is reduced.
//
// So let k1 to k8 be the states for blocks 1 to 8, respectively. We want to have a k such that:
//
// k[0]  = { k8[0],  k7[0],  k6[0],  k5[0],  k4[0],  k3[0],  k2[0],  k1[0]  }
// k[1]  = { k8[1],  k7[1],  k6[1],  k5[1],  k4[1],  k3[1],  k2[1],  k1[1]  }
// k[2]  = { k8[2],  k7[2],  k6[2],  k5[2],  k4[2],  k3[2],  k2[2],  k1[2]  }
// k[3]  = { k8[3],  k7[3],  k6[3],  k5[3],  k4[3],  k3[3],  k2[3],  k1[3]  }
// k[4]  = { k8[4],  k7[4],  k6[4],  k5[4],  k4[4],  k3[4],  k2[4],  k1[4]  }
// k[5]  = { k8[5],  k7[5],  k6[5],  k5[5],  k4[5],  k3[5],  k2[5],  k1[5]  }
// k[6]  = { k8[6],  k7[6],  k6[6],  k5[6],  k4[6],  k3[6],  k2[6],  k1[6]  }
// k[7]  = { k8[7],  k7[7],  k6[7],  k5[7],  k4[7],  k3[7],  k2[7],  k1[7]  }
// k[8]  = { k8[8],  k7[8],  k6[8],  k5[8],  k4[8],  k3[8],  k2[8],  k1[8]  }
// k[9]  = { k8[9],  k7[9],  k6[9],  k5[9],  k4[9],  k3[9],  k2[9],  k1[9]  }
// k[10] = { k8[10], k7[10], k6[10], k5[10], k4[10], k3[10], k2[10], k1[10] }
// k[11] = { k8[11], k7[11], k6[11], k5[11], k4[11], k3[11], k2[11], k1[11] }
// k[12] = { k8[12], k7[12], k6[12], k5[12], k4[12], k3[12], k2[12], k1[12] }
// k[13] = { k8[13], k7[13], k6[13], k5[13], k4[13], k3[13], k2[13], k1[13] }
// k[14] = { k8[14], k7[14], k6[14], k5[14], k4[14], k3[14], k2[14], k1[14] }
// k[15] = { k8[15], k7[15], k6[15], k5[15], k4[15], k3[15], k2[15], k1[15] }
//
//
// since we want to have as many free registers as possible we just need an alternative
// definition of rotate_x8 that receives r16 and r8 as stack values (instead of registers):
fn rotate_x8_s(reg u256[16] k, inline int i r, stack u256 r16 r8) -> reg u256[16]
{
  reg u256 t;

  if(r==16)
  { k[i] = #x86_VPSHUFB_256(k[i], r16); }
  else { if(r==8)
  { k[i] = #x86_VPSHUFB_256(k[i], r8); }
  else
  {
    t = k[i] <<8u32 r;
    k[i] = k[i] >>8u32 (32-r);
    k[i] ^= t;
  }}

  return k;
}

// we can now define a function _line_x8_v such that:
fn _line_x8_v(reg u256[16] k, inline int a b c r, stack u256 r16 r8) -> reg u256[16]
{
  k[a] +8u32= k[b];
  k[c] ^= k[a];
  k = rotate_x8_s(k, c, r, r16, r8);
  return k;
}

// and now a quarter_round function (not used for compilation, just here for the
// sake of the explanation):
/*
fn _quarter_round_x8(reg u256[16] k, inline int a b c d, stack u256 r16 r8) -> reg u256[16]
{
  k = _line_x8_v(k, a, b, d, 16, r16, r8);
  k = _line_x8_v(k, c, d, b, 12, r16, r8);
  k = _line_x8_v(k, a, b, d,  8, r16, r8);
  k = _line_x8_v(k, c, d, b,  7, r16, r8);
  return k;
}
*/

// you can check the definition of rounds_x8 down below. But imagine that you want to do
// some micro-optimizations: let's say you wanted to merge two consecutive
// _quarter_round_x8 calls into one double_quarter_round_x8, with a line_x8_v that
// performs as follows:
fn line_x8_v(reg u256[16] k, inline int a0 b0 c0 r0 a1 b1 c1 r1, stack u256 r16 r8) -> reg u256[16]
{
  k[a0] +8u32= k[b0];
  k[a1] +8u32= k[b1];

  k[c0] ^= k[a0];
  k[c1] ^= k[a1];

  k = rotate_x8_s(k, c0, r0, r16, r8);
  k = rotate_x8_s(k, c1, r1, r16, r8);

  return k;
}

// then a double quarter_round could be defined as (not used for compilation, just
// here for the sake of the explanation):
/*
fn _double_quarter_round_x8(reg u256[16] k, inline int a0 b0 c0 d0 a1 b1 c1 d1, stack u256 r16 r8) -> reg u256[16]
{
  k = line_x8_v(k, a0, b0, d0, 16, a1, b1, d1, 16, r16, r8);
  k = line_x8_v(k, c0, d0, b0, 12, c1, d1, b1, 12, r16, r8);
  k = line_x8_v(k, a0, b0, d0,  8, a1, b1, d1,  8, r16, r8);
  k = line_x8_v(k, c0, d0, b0,  7, c1, d1, b1,  7, r16, r8);
  return k;
}
*/

// but imagine that you don't want to execute 2 shuffles (from memory operands)
// next to each other: we can just define a variant of double_quarter_round_x8 such that:
fn double_quarter_round_x8(reg u256[16] k, inline int a0 b0 c0 d0 a1 b1 c1 d1, stack u256 r16 r8) -> reg u256[16]
{
  k = _line_x8_v(k, a0, b0, d0, 16, r16, r8);
  k =  line_x8_v(k, c0, d0, b0, 12, a1, b1, d1, 16, r16, r8);
  k =  line_x8_v(k, a0, b0, d0,  8, c1, d1, b1, 12, r16, r8);
  k =  line_x8_v(k, c0, d0, b0,  7, a1, b1, d1,  8, r16, r8);
  k = _line_x8_v(k, c1, d1, b1,  7, r16, r8);
  return k;
}

fn column_round_x8(reg u256[16] k, stack u256 k15 s_r16 s_r8) -> reg u256[16], stack u256
{
  stack u256 k_;

  k = double_quarter_round_x8(k, 0, 4, 8, 12, 2, 6, 10, 14, s_r16, s_r8);

  k[15] = k15;
  k_ = k[14];

  k = double_quarter_round_x8(k, 1, 5, 9, 13, 3, 7, 11, 15, s_r16, s_r8);

  return k, k_;
}

fn diagonal_round_x8(reg u256[16] k, stack u256 k14 s_r16 s_r8) -> reg u256[16], stack u256
{
  stack u256 k_;

  k = double_quarter_round_x8(k, 1, 6, 11, 12, 0, 5, 10, 15, s_r16, s_r8);

  k[14] = k14;
  k_ = k[15];

  k = double_quarter_round_x8(k, 2, 7, 8, 13, 3, 4, 9, 14, s_r16, s_r8);

  return k, k_;
}
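// rounds_x8 runs the usual 10 double rounds. Two implementation details worth noting:
// the loop counter counts down from 10 and the loop repeats on the zero flag set by
// #x86_DEC (the commented-out c = 0 / while(c < 10) shows the equivalent counting-up
// loop), and column_round_x8/diagonal_round_x8 always keep one of the 16 state words
// spilled to the stack (k[15] during the first half of the column round, k[14] during
// the first half of the diagonal round), so that the 15 live state registers plus the
// temporary used by rotate_x8_s fit in the 16 available ymm registers.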
fn rounds_x8(reg u256[16] k, stack u256 s_r16 s_r8) -> reg u256[16]
{
  reg u64 c;
  reg bool zf;
  stack u256 k15 k14;

  k15 = k[15];

  //c = 0;
  //while(c < 10)
  c = 10;
  align while
  {
    k, k14 = column_round_x8(k, k15, s_r16, s_r8);
    k, k15 = diagonal_round_x8(k, k14, s_r16, s_r8);
    (_,_,_,zf,c) = #x86_DEC(c);
  } (!zf)
  //c += 1;
  //}

  k[15] = k15;

  return k;
}

fn chacha20_more_than_256(reg u64 output plain, reg u32 len, reg u64 key nonce, reg u32 counter)
{
  stack u256[16] st;
  reg u256[16] k;
  stack u256 s_r16 s_r8;

  s_r16, s_r8 = load_shufb_cmd();
  st = init_x8(key, nonce, counter);

  while(len >= 512)
  {
    k = copy_state_x8(st);
    k = rounds_x8(k, s_r16, s_r8);
    k = sum_states_x8(k, st);
    output, plain, len = store_x8(output, plain, len, k);
    st = increment_counter_x8(st);
  }

  if(len > 0)
  {
    k = copy_state_x8(st);
    k = rounds_x8(k, s_r16, s_r8);
    k = sum_states_x8(k, st);
    store_x8_last(output, plain, len, k);
  }
}

fn chacha20_less_than_257(reg u64 output plain, reg u32 len, reg u64 key nonce, reg u32 counter)
{
  reg u256[4] st k1 k2;

  st = init_x2(key, nonce, counter);

  if(len > 128)
  {
    k1, k2 = copy_state_x4(st);
    k1, k2 = rounds_x4(k1, k2);
    k1, k2 = sum_states_x4(k1, k2, st);
    k1, k2 = perm_x4(k1, k2);
    output, plain, len, k1 = store_x2(output, plain, len, k1);
    store_x2_last(output, plain, len, k2);
  }
  else
  {
    k1 = copy_state_x2(st);
    k1 = rounds_x2(k1);
    k1 = sum_states_x2(k1, st);
    k1 = perm_x2(k1);
    store_x2_last(output, plain, len, k1);
  }
}
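// chacha20_avx2 dispatches on the message length: up to 256 bytes are handled by the
// 2/4-block path (chacha20_less_than_257), longer messages by the 8-block path
// (chacha20_more_than_256).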
export fn chacha20_avx2(reg u64 output plain, reg u32 len, reg u64 key nonce, reg u32 counter)
{
  if(len < 257)
  { chacha20_less_than_257(output, plain, len, key, nonce, counter); }
  else
  { chacha20_more_than_256(output, plain, len, key, nonce, counter); }
}
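// Usage note (a sketch, assuming the usual Jasmin export / System V AMD64 calling
// convention; the C declaration below is an assumption for illustration, check the
// generated assembly before relying on it):
//
//   void chacha20_avx2(uint8_t *output, const uint8_t *plain, uint32_t len,
//                      const uint8_t key[32], const uint8_t nonce[12], uint32_t counter);
//
// key is read as two 16-byte loads (32 bytes) and nonce as a 4-byte plus an 8-byte load
// (12 bytes, i.e. the IETF ChaCha20 nonce), as can be seen in init_x2/init_x8.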