// convolution -- CUDA kernel (__global__) plus the tail of a following launch call.
//
// What the visible code does:
//   * Each thread derives a (row, col) position from its 2D block/thread
//     indices: the y index is multiplied by rowStep, the x index by colStep.
//   * The thread proceeds only when row <= inputRows - kernelRows and
//     col <= inputCols - kernelCols; all other threads do nothing.
//   * A proceeding thread first stores 0 into output[row + col*outputLd] and
//     then enters a loop over i (j is declared alongside i; its use is not
//     visible here).
//
// Parameters (roles inferred from names -- TODO confirm against the caller):
//   input,  inputRows,  inputCols,  inputLd   -- presumably the source matrix,
//                                                its extent and leading dimension
//   kernel, kernelRows, kernelCols, kernelLd  -- presumably the filter matrix,
//                                                its extent and leading dimension
//   rowStep, colStep                          -- multipliers applied to the
//                                                thread indices (see above)
//   output, outputLd                          -- destination buffer, indexed as
//                                                row + col*outputLd
//
// NOTE(review): this text appears to be truncated. It jumps from "for (i=0; i"
// straight to ">> (input, ...", so the loop condition and body, the end of the
// kernel, and the start of what looks like a host-side <<<...>>> launch of this
// kernel are all missing (possibly everything between a '<' and a later '>'
// was stripped). The line cannot compile as-is; restore the missing span from
// the original source before making any functional change.
__global__ void convolution(float* input, int inputRows, int inputCols, int inputLd, float* kernel, int kernelRows, int kernelCols, int kernelLd, int rowStep, int colStep, float* output, int outputLd) { int row = (blockIdx.y * blockDim.y + threadIdx.y) * rowStep; int col = (blockIdx.x * blockDim.x + threadIdx.x) * colStep; if (row <= inputRows - kernelRows && col <= inputCols - kernelCols) { int i, j; output[row+col*outputLd] = 0; for (i=0; i>> (input, inputRows, inputCols, inputLd, kernel, kernelRows, kernelCols, kernelLd, rowStep, colStep, output, outputLd); } }