/* * Copyright 2010-2023 Branimir Karadzic. All rights reserved. * License: https://github.com/bkaradzic/bx/blob/master/LICENSE */ #include #include #include #include #include static void flushCache() { static uint32_t length = 1 << 26; static uint8_t* input = new uint8_t[length]; static uint8_t* output = new uint8_t[length]; bx::memCopy(output, input, length); } typedef bx::simd128_t (*SimdRsqrtFn)(bx::simd128_t _a); template void simd_rsqrt_bench(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVertices) { for (uint32_t ii = 0, num = _numVertices/4; ii < num; ++ii) { bx::simd128_t* ptr = &_src[ii*4]; bx::simd128_t tmp0 = bx::simd_ld(ptr + 0); bx::simd128_t tmp1 = bx::simd_ld(ptr + 1); bx::simd128_t tmp2 = bx::simd_ld(ptr + 2); bx::simd128_t tmp3 = bx::simd_ld(ptr + 3); bx::simd128_t rsqrt0 = simdRsqrtFn(tmp0); bx::simd128_t rsqrt1 = simdRsqrtFn(tmp1); bx::simd128_t rsqrt2 = simdRsqrtFn(tmp2); bx::simd128_t rsqrt3 = simdRsqrtFn(tmp3); ptr = &_dst[ii*4]; bx::simd_st(ptr + 0, rsqrt0); bx::simd_st(ptr + 1, rsqrt1); bx::simd_st(ptr + 2, rsqrt2); bx::simd_st(ptr + 3, rsqrt3); } } void simd_bench_pass(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVertices) { const uint32_t numIterations = 10; { int64_t elapsed = 0; for (uint32_t test = 0; test < numIterations; ++test) { flushCache(); elapsed += -bx::getHPCounter(); simd_rsqrt_bench(_dst, _src, _numVertices); elapsed += bx::getHPCounter(); } printf(" simd_rsqrt_est: %15f\n", double(elapsed) ); } { int64_t elapsed = 0; for (uint32_t test = 0; test < numIterations; ++test) { flushCache(); elapsed += -bx::getHPCounter(); simd_rsqrt_bench(_dst, _src, _numVertices); elapsed += bx::getHPCounter(); } printf(" simd_rsqrt_nr: %15f\n", double(elapsed) ); } { int64_t elapsed = 0; for (uint32_t test = 0; test < numIterations; ++test) { flushCache(); elapsed += -bx::getHPCounter(); simd_rsqrt_bench(_dst, _src, _numVertices); elapsed += bx::getHPCounter(); } printf("simd_rsqrt_carmack: %15f\n", double(elapsed) ); } { int64_t elapsed = 0; for (uint32_t test = 0; test < numIterations; ++test) { flushCache(); elapsed += -bx::getHPCounter(); simd_rsqrt_bench(_dst, _src, _numVertices); elapsed += bx::getHPCounter(); } printf(" simd_rsqrt: %15f\n", double(elapsed) ); } } void simd_bench() { bx::DefaultAllocator allocator; bx::RngMwc rng; const uint32_t numVertices = 1024*1024; uint8_t* data = (uint8_t*)bx::alloc(&allocator, 2*numVertices*sizeof(bx::simd128_t), 16); bx::simd128_t* src = (bx::simd128_t*)data; bx::simd128_t* dst = &src[numVertices]; printf("\n -- positive & negative --\n"); for (uint32_t ii = 0; ii < numVertices; ++ii) { float* ptr = (float*)&src[ii]; bx::store(ptr, bx::randUnitSphere(&rng) ); ptr[3] = 1.0f; } simd_bench_pass(dst, src, numVertices); printf("\n -- positive only --\n"); for (uint32_t ii = 0; ii < numVertices; ++ii) { float* ptr = (float*)&src[ii]; ptr[0] = bx::abs(ptr[0]); ptr[1] = bx::abs(ptr[1]); ptr[2] = bx::abs(ptr[2]); ptr[3] = bx::abs(ptr[3]); } simd_bench_pass(dst, src, numVertices); bx::free(&allocator, data, 16); }