#include "poseidon2_constants.cuh" #include "stdio.h" #define CELLS 24 #define ROUNDS_FULL 8 #define ROUNDS_HALF_FULL (ROUNDS_FULL / 2) #define ROUNDS_PARTIAL 21 #define ROW_SIZE (CELLS + ROUNDS_PARTIAL) #define CELLS_RATE 16 #define CELLS_OUT 8 struct __align__(CELLS_OUT % 4 == 0 ? 16 : 4) poseidon_out_t { fr_t data[CELLS_OUT]; }; struct __align__(CELLS_RATE % 4 == 0 ? 16 : 4) poseidon_in_t { fr_t data[CELLS_RATE]; }; namespace poseidon2 { __device__ __forceinline__ void pow7_cells_full(fr_t cells[CELLS]) { #pragma unroll for (uint32_t i = 0; i < CELLS; i++) { cells[i] ^= 7; } } __device__ __forceinline__ void multiply_by_m_int(fr_t cells[CELLS]) { fr_t sum = 0; #pragma unroll for (uint32_t i = 0; i < CELLS; i++) { sum += cells[i]; } #pragma unroll for (uint32_t i = 0; i < CELLS; i++) { cells[i] = sum + M_INT_DIAG_HZN[i] * cells[i]; } } __device__ __forceinline__ void multiply_by_4x4_circulant(fr_t x[4]) { // See appendix B of Poseidon2 paper. fr_t t0 = x[0] + x[1]; fr_t t1 = x[2] + x[3]; fr_t t2 = x[1] + x[1] + t1; fr_t t3 = x[3] + x[3] + t0; fr_t t4 = fr_t(4) * t1 + t3; fr_t t5 = fr_t(4) * t0 + t2; fr_t t6 = t3 + t5; fr_t t7 = t2 + t4; x[0] = t6; x[1] = t5; x[2] = t7; x[3] = t4; } __device__ __forceinline__ void multiply_by_m_ext(fr_t cells[CELLS]) { // Optimized method for multiplication by M_EXT. // See appendix B of Poseidon2 paper for additional details. fr_t tmp_sums[4] = {0}; for (uint32_t i = 0; i < CELLS / 4; i++) { fr_t* tmp = cells + i * 4; multiply_by_4x4_circulant(tmp); for (uint32_t j = 0; j < 4; j++) { tmp_sums[j] += tmp[j]; } } for (uint32_t i = 0; i < CELLS; i++) { cells[i] += tmp_sums[i % 4]; } } __device__ __forceinline__ void full_round(fr_t cells[CELLS], uint32_t round_constants_off) { #pragma unroll for (uint32_t i = 0; i < CELLS; i++) { cells[i] += ROUND_CONSTANTS[round_constants_off + i]; } pow7_cells_full(cells); multiply_by_m_ext(cells); } __device__ __forceinline__ void partial_round(fr_t cells[CELLS], uint32_t round_constants_off) { cells[0] += ROUND_CONSTANTS[round_constants_off]; cells[0] ^= 7; multiply_by_m_int(cells); } __device__ __forceinline__ void poseidon2_mix(fr_t cells[CELLS]) { uint32_t round_constants_off = 0; // First linear layer. multiply_by_m_ext(cells); #pragma unroll 1 for (uint32_t i = 0; i < ROUNDS_HALF_FULL * 2 + ROUNDS_PARTIAL; i++) { if (i < ROUNDS_HALF_FULL || i >= ROUNDS_HALF_FULL + ROUNDS_PARTIAL) { full_round(cells, round_constants_off); round_constants_off += CELLS; } else { partial_round(cells, round_constants_off); round_constants_off++; } } } } // namespace poseidon2 __launch_bounds__(256, 4) __global__ void _poseidon2_fold(poseidon_out_t* output, const poseidon_in_t* input, uint32_t output_size) { uint32_t gid = blockDim.x * blockIdx.x + threadIdx.x; fr_t cells[CELLS]; #pragma unroll for (uint32_t i = 0; i < CELLS; i++) { cells[i] = 0; } poseidon_in_t in = input[gid]; #pragma unroll for (size_t i = 0; i < CELLS_RATE; i++) { cells[i] = in.data[i]; } poseidon2::poseidon2_mix(cells); poseidon_out_t tmp; #pragma unroll for (uint32_t i = 0; i < CELLS_OUT; i++) { tmp.data[i] = cells[i]; } output[gid] = tmp; } __launch_bounds__(256, 4) __global__ void _poseidon2_rows(poseidon_out_t* out, const fr_t* matrix, uint32_t dim_x, uint32_t dim_y) { uint32_t gid = blockDim.x * blockIdx.x + threadIdx.x; if (gid >= dim_x) return; fr_t cells[CELLS] = {0}; matrix += gid; uint32_t i = 0; do { #pragma unroll for (uint32_t j = 0; j < CELLS_RATE; j++, i++) cells[j] = i < dim_y ? 
matrix[i * dim_x] : fr_t{0}; poseidon2::poseidon2_mix(cells); } while (i < dim_y); poseidon_out_t tmp; #pragma unroll for (uint32_t i = 0; i < CELLS_OUT; i++) { tmp.data[i] = cells[i]; } out[gid] = tmp; }
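
// ---------------------------------------------------------------------------
// A minimal host-side launch sketch, not part of the original file: it shows
// one way the two kernels above might be dispatched. The wrapper names
// `poseidon2_fold` / `poseidon2_rows` and the `stream` parameter are
// hypothetical; the block size of 256 matches the __launch_bounds__(256, 4)
// declarations on both kernels.

#include <cuda_runtime.h>

static inline uint32_t div_ceil(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

void poseidon2_fold(poseidon_out_t* d_out, const poseidon_in_t* d_in,
                    uint32_t output_size, cudaStream_t stream)
{
    const uint32_t block = 256;  // must not exceed the __launch_bounds__ maximum
    _poseidon2_fold<<<div_ceil(output_size, block), block, 0, stream>>>(
        d_out, d_in, output_size);
}

void poseidon2_rows(poseidon_out_t* d_out, const fr_t* d_matrix,
                    uint32_t dim_x, uint32_t dim_y, cudaStream_t stream)
{
    const uint32_t block = 256;
    // One thread per column of the dim_x-wide, dim_y-tall matrix.
    _poseidon2_rows<<<div_ceil(dim_x, block), block, 0, stream>>>(
        d_out, d_matrix, dim_x, dim_y);
}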