#include #include #define PI (3.141592653589793238462643) #ifdef AUTD3_USE_METER #define MILLIMETER (0.001) #else #define MILLIMETER (1.0) #endif #define T4010A1_AMP (275.574246625 * 200.0 / 4 / PI * MILLIMETER) __device__ float absc2(const cuFloatComplex x) { return x.x * x.x + x.y * x.y; } __device__ float absc(const cuFloatComplex x) { return sqrt(absc2(x)); } __device__ cuFloatComplex conj(const cuFloatComplex a) { return make_cuFloatComplex(a.x, -a.y); } __device__ cuFloatComplex mulc(const cuFloatComplex a, const cuFloatComplex b) { return make_cuFloatComplex(a.x * b.x - a.y * b.y, a.x * b.y + a.y * b.x); } __device__ cuFloatComplex mulcr(const cuFloatComplex a, const float b) { return make_cuFloatComplex(a.x * b, a.y * b); } __device__ cuFloatComplex divcr(const cuFloatComplex x, const float y) { const float r = x.x / y; const float i = x.y / y; return make_cuFloatComplex(r, i); } __global__ void scaled_to_kernel(const cuFloatComplex *a, const cuFloatComplex *b, uint32_t row, uint32_t col, cuFloatComplex *c) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int i = yi + xi * row; c[i] = mulc(divcr(a[i], absc(a[i])), b[i]); } __global__ void get_diagonal_kernel(const cuFloatComplex *a, uint32_t row, uint32_t col, cuFloatComplex *b) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; if (xi == yi) { unsigned int idx = yi + xi * row; b[xi] = a[idx]; } } __global__ void get_diagonal_kernel(const float *a, uint32_t row, uint32_t col, float *b) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; if (xi == yi) { unsigned int idx = yi + xi * row; b[xi] = a[idx]; } } __global__ void set_diagonal_kernel_c(const cuFloatComplex *a, uint32_t n, cuFloatComplex *b) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; if (xi >= n) return; unsigned int idx = xi + xi * n; b[idx] = a[xi]; } __global__ void set_diagonal_kernel(const float *a, uint32_t n, float *b) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; if (xi >= n) return; unsigned int idx = xi + xi * n; b[idx] = a[xi]; } __global__ void reciprocal_kernel(const cuFloatComplex *a, const uint32_t row, const uint32_t col, cuFloatComplex *b) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int idx = yi + xi * row; float s = absc2(a[idx]); const float x = a[idx].x / s; const float y = -a[idx].y / s; b[idx] = make_cuFloatComplex(x, y); } __global__ void hadamard_product_kernel(const cuFloatComplex *a, const cuFloatComplex *b, const uint32_t row, const uint32_t col, cuFloatComplex *c) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int idx = yi + xi * row; c[idx] = mulc(a[idx], b[idx]); } __global__ void norm_squared_kernel(const cuFloatComplex *a, const uint32_t row, const uint32_t col, float *b) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int idx = yi + xi * row; b[idx] = absc2(a[idx]); } __global__ void make_complex_kernel(const float *re, const uint32_t row, const uint32_t col, cuFloatComplex *dst) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int idx = yi + xi * row; dst[idx] = make_cuFloatComplex(re[idx], 0); } __global__ void make_complex2_kernel(const float *re, const float *im, const uint32_t row, const uint32_t col, cuFloatComplex *dst) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int idx = yi + xi * row; dst[idx] = make_cuFloatComplex(re[idx], im[idx]); } __global__ void conj_kernel(const cuFloatComplex *a, const uint32_t row, const uint32_t col, cuFloatComplex *b) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int idx = yi + xi * row; b[idx] = conj(a[idx]); } __device__ cuFloatComplex expc(const cuFloatComplex x) { const float s = exp(x.x); const float r = cos(x.y); const float i = sin(x.y); return make_cuFloatComplex(s * r, s * i); } __global__ void exp_kernel(const cuFloatComplex *a, const uint32_t row, const uint32_t col, cuFloatComplex *b) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int idx = yi + xi * row; b[idx] = expc(a[idx]); } __global__ void real_kernel(const cuFloatComplex *src, const uint32_t row, const uint32_t col, float *dst) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int idx = yi + xi * row; dst[idx] = src[idx].x; } __global__ void imag_kernel(const cuFloatComplex *src, const uint32_t row, const uint32_t col, float *dst) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; unsigned int idx = yi + xi * row; dst[idx] = src[idx].y; } __global__ void col_sum_kernel(const float *din, uint32_t m, uint32_t n, float *dout) { uint32_t row = blockIdx.y * blockDim.y + threadIdx.y; if (row >= m) return; float sum = 0; for (uint32_t col = 0; col < n; col++) sum += din[col * m + row]; dout[row] = sum; } __global__ void generate_propagation_matrix_kernel( const float *positions, const float *foci, const float *wavenums, const uint32_t row, const uint32_t col, cuFloatComplex *dst) { unsigned int xi = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yi = blockIdx.y * blockDim.y + threadIdx.y; if (xi >= col || yi >= row) return; float xd = foci[3 * yi] - positions[3 * xi]; float yd = foci[3 * yi + 1] - positions[3 * xi + 1]; float zd = foci[3 * yi + 2] - positions[3 * xi + 2]; float dist = sqrt(xd * xd + yd * yd + zd * zd); float r = T4010A1_AMP / dist; float phase = wavenums[xi] * dist; dst[yi + xi * row] = make_cuFloatComplex(r * cos(phase), r * sin(phase)); } #ifdef __cplusplus extern "C" { #endif #define BLOCK_SIZE (32) void cu_scaled_to(const cuFloatComplex *a, const cuFloatComplex *b, const uint32_t row, const uint32_t col, cuFloatComplex *c) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); scaled_to_kernel<<>>(a, b, row, col, c); } void cu_get_diagonal(const float *a, const uint32_t row, const uint32_t col, float *b) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); get_diagonal_kernel<<>>(a, row, col, b); } void cu_get_diagonal_c(const cuFloatComplex *a, const uint32_t row, const uint32_t col, cuFloatComplex *b) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); get_diagonal_kernel<<>>(a, row, col, b); } void cu_set_diagonal(const float *a, const uint32_t n, float *b) { dim3 block(BLOCK_SIZE * BLOCK_SIZE, 1, 1); dim3 grid((n - 1) / (BLOCK_SIZE * BLOCK_SIZE) + 1, 1, 1); set_diagonal_kernel<<>>(a, n, b); } void cu_set_diagonal_c(const cuFloatComplex *a, const uint32_t n, cuFloatComplex *b) { dim3 block(BLOCK_SIZE * BLOCK_SIZE, 1, 1); dim3 grid((n - 1) / (BLOCK_SIZE * BLOCK_SIZE) + 1, 1, 1); set_diagonal_kernel_c<<>>(a, n, b); } void cu_reciprocal(const cuFloatComplex *a, const uint32_t row, const uint32_t col, cuFloatComplex *b) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); reciprocal_kernel<<>>(a, row, col, b); } void cu_hadamard_product(const cuFloatComplex *a, const cuFloatComplex *b, const uint32_t row, const uint32_t col, cuFloatComplex *c) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); hadamard_product_kernel<<>>(a, b, row, col, c); } void cu_norm_squared(const cuFloatComplex *a, const uint32_t row, const uint32_t col, float *b) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); norm_squared_kernel<<>>(a, row, col, b); } void cu_make_complex(const float *re, const uint32_t row, const uint32_t col, cuFloatComplex *dst) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); make_complex_kernel<<>>(re, row, col, dst); } void cu_make_complex2(const float *re, const float *im, const uint32_t row, const uint32_t col, cuFloatComplex *dst) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); make_complex2_kernel<<>>(re, im, row, col, dst); } void cu_conj(const cuFloatComplex *a, const uint32_t row, const uint32_t col, cuFloatComplex *b) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); conj_kernel<<>>(a, row, col, b); } void cu_exp(const cuFloatComplex *a, const uint32_t row, const uint32_t col, cuFloatComplex *b) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); exp_kernel<<>>(a, row, col, b); } void cu_real(const cuFloatComplex *src, const uint32_t row, const uint32_t col, float *dst) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); real_kernel<<>>(src, row, col, dst); } void cu_imag(const cuFloatComplex *src, const uint32_t row, const uint32_t col, float *dst) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); imag_kernel<<>>(src, row, col, dst); } void cu_reduce_col(const float *mat, const uint32_t m, const uint32_t n, float *result) { dim3 block(1, BLOCK_SIZE * BLOCK_SIZE, 1); dim3 grid(1, (m - 1) / (BLOCK_SIZE * BLOCK_SIZE) + 1, 1); col_sum_kernel<<>>(mat, m, n, result); } void cu_generate_propagation_matrix(const float *positions, const float *foci, const float *wavenums, const uint32_t row, const uint32_t col, cuFloatComplex *dst) { dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid((col - 1) / BLOCK_SIZE + 1, (row - 1) / BLOCK_SIZE + 1, 1); generate_propagation_matrix_kernel<<>>(positions, foci, wavenums, row, col, dst); } #ifdef __cplusplus } #endif