// NOTE(review): both original #include directives had lost their header names
// (the source read "#include #include"). The two headers below are a
// best-effort restoration -- confirm against the original file.
#include <stdio.h>
#include <cuda_runtime.h>

// Tile edge length. The kernel must be launched with a
// BLOCK_SIZE x BLOCK_SIZE thread block.
#define BLOCK_SIZE 32

/*
 * Out-of-place transpose of a square, row-major N x N float matrix:
 *     output[c * N + r] = input[r * N + c]   for all 0 <= r, c < N.
 *
 * Each thread block stages one BLOCK_SIZE x BLOCK_SIZE tile in shared memory
 * so that, within a warp, consecutive threadIdx.x values touch consecutive
 * global addresses on both the load and the store.
 *
 * Launch requirements:
 *   - blockDim = (BLOCK_SIZE, BLOCK_SIZE, 1). The tile is statically sized and
 *     the transposed index math assumes blockDim.x == blockDim.y.
 *   - gridDim.x and gridDim.y >= ceil(N / BLOCK_SIZE). N does not have to be a
 *     multiple of BLOCK_SIZE; threads outside the matrix are masked off.
 *   - Static shared memory: BLOCK_SIZE * (BLOCK_SIZE + 1) floats per block.
 *   - input and output must not alias: a block overwrites a tile that another
 *     block may not have read yet, so in-place use is a data race.
 *
 * Parameters:
 *   input  - device pointer to the N * N source elements (read only).
 *   output - device pointer to the N * N destination elements.
 *   N      - number of rows (== number of columns) of the matrix.
 */
__global__ void transpose(float *input, float *output, size_t N)
{
    // One extra column of padding: the tile is written with threadIdx.x as the
    // row index, so without padding the 32 lanes of a warp would be exactly 32
    // words apart and all land in the same shared-memory bank.
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE + 1];

    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;

    // All global coordinates are kept in size_t so that row * N + col cannot
    // be truncated for large N.

    // Coordinates of the element this thread loads from `input`.
    const size_t srcCol = (size_t)blockIdx.x * blockDim.x + tx;
    const size_t srcRow = (size_t)blockIdx.y * blockDim.y + ty;

    // Coordinates of the element this thread stores to `output`: the block
    // indices are swapped so the block writes the mirrored tile.
    const size_t dstCol = (size_t)blockIdx.y * blockDim.x + tx;
    const size_t dstRow = (size_t)blockIdx.x * blockDim.y + ty;

    // Load: consecutive tx -> consecutive global addresses. The element is
    // stored transposed inside the tile.
    if (srcCol < N && srcRow < N) {
        tile[tx][ty] = input[srcRow * N + srcCol];
    } else {
        // Outside the matrix: fill with a defined value.
        tile[tx][ty] = 0.0f;
    }

    // Every thread of the block reaches this barrier (it is outside the
    // bounds checks); it orders the tile writes before the reads below.
    __syncthreads();

    // Store: consecutive tx -> consecutive global addresses again, reading the
    // tile row that other threads filled above.
    if (dstCol < N && dstRow < N) {
        output[dstRow * N + dstCol] = tile[ty][tx];
    }
}