#include <metal_stdlib>

using namespace metal;

/// Element-wise addition: result[index] = a[index] + b[index].
///
/// Each thread handles exactly one element, selected by its position in the
/// dispatch grid.
///
/// @param a      Left-hand operand, bound at buffer index 0 (read-only).
/// @param b      Right-hand operand, bound at buffer index 1 (read-only).
/// @param result Output, bound at buffer index 2.
/// @param index  This thread's position in the grid; used directly as the
///               element index.
///
/// NOTE: there is no bounds check in this kernel. The host is presumably
/// expected to dispatch no more threads than the buffers have elements --
/// confirm against the dispatch code.
kernel void vector_add(
    device const float* a      [[buffer(0)]],
    device const float* b      [[buffer(1)]],
    device float*       result [[buffer(2)]],
    uint                index  [[thread_position_in_grid]])
{
    result[index] = a[index] + b[index];
}

/// Element-wise multiplication: result[index] = a[index] * b[index].
///
/// Each thread handles exactly one element, selected by its position in the
/// dispatch grid.
///
/// @param a      Left-hand operand, bound at buffer index 0 (read-only).
/// @param b      Right-hand operand, bound at buffer index 1 (read-only).
/// @param result Output, bound at buffer index 2.
/// @param index  This thread's position in the grid; used directly as the
///               element index.
///
/// NOTE: there is no bounds check in this kernel. The host is presumably
/// expected to dispatch no more threads than the buffers have elements --
/// confirm against the dispatch code.
kernel void vector_multiply(
    device const float* a      [[buffer(0)]],
    device const float* b      [[buffer(1)]],
    device float*       result [[buffer(2)]],
    uint                index  [[thread_position_in_grid]])
{
    result[index] = a[index] * b[index];
}

/// Element-wise ReLU: output[index] = max(0.0f, input[index]).
///
/// Each thread handles exactly one element, selected by its position in the
/// dispatch grid.
///
/// @param input  Source values, bound at buffer index 0 (read-only).
/// @param output Destination, bound at buffer index 1.
/// @param index  This thread's position in the grid; used directly as the
///               element index.
///
/// NOTE: there is no bounds check in this kernel. The host is presumably
/// expected to dispatch no more threads than the buffers have elements --
/// confirm against the dispatch code.
kernel void vector_relu(
    device const float* input  [[buffer(0)]],
    device float*       output [[buffer(1)]],
    uint                index  [[thread_position_in_grid]])
{
    output[index] = max(0.0f, input[index]);
}