#include "tensor_ops.cuh"

#include <cuda_runtime.h>

#include <cstdio>

// Number of threads per block used by the launch in tensor_scalar_mul.
#define BLOCK_SIZE 256

/**
 * Element-wise scalar multiply: output[i] = input[i] * scalar for 0 <= i < size.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
 * flat index is >= size do nothing, so the grid may overshoot the data.
 * Uses no shared memory.
 *
 * Each thread reads and writes only its own index, so passing the same buffer
 * as both `output` and `input` is well defined.
 *
 * @param output  destination buffer, at least `size` floats
 * @param input   source buffer, at least `size` floats
 * @param scalar  multiplier applied to every element
 * @param size    number of elements to process
 */
__global__ void scalar_mul_kernel(float *output, const float *input,
                                  const float scalar, const int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = input[idx] * scalar;
    }
}

// Prints a diagnostic to stderr when `err` is not cudaSuccess.
// Returns true if an error was reported.
static bool report_cuda_error(const char *what, cudaError_t err) {
    if (err == cudaSuccess) {
        return false;
    }
    std::fprintf(stderr, "tensor_scalar_mul: %s failed: %s\n", what,
                 cudaGetErrorString(err));
    return true;
}

extern "C" {

/**
 * Launches scalar_mul_kernel over `size` elements and blocks until the device
 * has finished.
 *
 * The pointers are forwarded unchanged to the kernel, so they must be
 * dereferenceable from device code. A non-positive `size` is a no-op.
 *
 * The function returns void, so CUDA failures (bad launch configuration or an
 * execution fault surfaced by the synchronize) are reported on stderr rather
 * than propagated to the caller.
 *
 * @param output  destination buffer, at least `size` floats
 * @param input   source buffer, at least `size` floats
 * @param scalar  multiplier applied to every element
 * @param size    number of elements to process
 */
void tensor_scalar_mul(float *output, const float *input, const float scalar,
                       const int size) {
    // A grid dimension of zero (or a negative count) is an invalid launch
    // configuration; there is nothing to do anyway.
    if (size <= 0) {
        return;
    }

    // Ceiling division written so that it cannot overflow int, unlike
    // (size + BLOCK_SIZE - 1) / BLOCK_SIZE when size is close to INT_MAX.
    const int num_blocks = (size - 1) / BLOCK_SIZE + 1;

    scalar_mul_kernel<<<num_blocks, BLOCK_SIZE>>>(output, input, scalar, size);

    // Kernel launches do not return a status; configuration errors show up
    // through cudaGetLastError().
    if (report_cuda_error("kernel launch", cudaGetLastError())) {
        return;
    }

    // Faults raised while the kernel runs are reported by the next
    // synchronizing call.
    report_cuda_error("cudaDeviceSynchronize", cudaDeviceSynchronize());
}

}  // extern "C"