//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-33281558
// Cuda compilation tools, release 12.3, V12.3.52
// Based on NVVM 7.0.1
//
// ---------------------------------------------------------------------------
// Kernel: _Z4gemmPfS_S_mmm  ==  gemm(float*, float*, float*, size_t, size_t, size_t)
//
// Naive per-thread GEMM inner product. Based on the address arithmetic below:
//   param_0 = A   (read via %rd2;  rows of length param_4)
//   param_1 = B   (read via %rd1;  row stride param_5 floats)
//   param_2 = C   (written once;   row stride param_5 floats)
//   param_3 = row bound checked against the x-derived thread index
//   param_4 = K   (inner/reduction dimension; loop trip count)
//   param_5 = N   (columns of B/C; also the column bound for y threads)
//
// Each thread computes (presumably row-major; consistent with the strides):
//   row = ctaid.x*ntid.x + tid.x,  col = ctaid.y*ntid.y + tid.y
//   C[row*N + col] = sum_{k=0..K-1} A[row*K + k] * B[k*N + col]
// The reduction is unrolled by 4 (main loop BB0_4) with a "nounroll"
// remainder loop (BB0_7) handling K mod 4 iterations.
// ---------------------------------------------------------------------------

.version 8.3
.target sm_89
.address_size 64

	// .globl	_Z4gemmPfS_S_mmm

.visible .entry _Z4gemmPfS_S_mmm(
	.param .u64 _Z4gemmPfS_S_mmm_param_0,
	.param .u64 _Z4gemmPfS_S_mmm_param_1,
	.param .u64 _Z4gemmPfS_S_mmm_param_2,
	.param .u64 _Z4gemmPfS_S_mmm_param_3,
	.param .u64 _Z4gemmPfS_S_mmm_param_4,
	.param .u64 _Z4gemmPfS_S_mmm_param_5
)
{
	.reg .pred 	%p<9>;
	.reg .f32 	%f<30>;
	.reg .b32 	%r<9>;
	.reg .b64 	%rd<61>;

	// Load kernel parameters (all 64-bit: three pointers, three sizes).
	ld.param.u64 	%rd31, [_Z4gemmPfS_S_mmm_param_0];	// A
	ld.param.u64 	%rd32, [_Z4gemmPfS_S_mmm_param_1];	// B
	ld.param.u64 	%rd28, [_Z4gemmPfS_S_mmm_param_2];	// C (converted to global just before the store)
	ld.param.u64 	%rd33, [_Z4gemmPfS_S_mmm_param_3];	// row bound
	ld.param.u64 	%rd29, [_Z4gemmPfS_S_mmm_param_4];	// K
	ld.param.u64 	%rd30, [_Z4gemmPfS_S_mmm_param_5];	// N
	// Generic -> global address conversion for the two read-only operands.
	cvta.to.global.u64 	%rd1, %rd32;			// %rd1 = B (global)
	cvta.to.global.u64 	%rd2, %rd31;			// %rd2 = A (global)
	// row index from the x dimension: row = ctaid.x*ntid.x + tid.x
	mov.u32 	%r1, %ntid.x;
	mov.u32 	%r2, %ctaid.x;
	mov.u32 	%r3, %tid.x;
	mad.lo.s32 	%r4, %r2, %r1, %r3;
	cvt.u64.u32 	%rd3, %r4;				// %rd3 = row (zero-extended to u64)
	// col index from the y dimension: col = ctaid.y*ntid.y + tid.y
	mov.u32 	%r5, %ntid.y;
	mov.u32 	%r6, %ctaid.y;
	mov.u32 	%r7, %tid.y;
	mad.lo.s32 	%r8, %r6, %r5, %r7;
	cvt.u64.u32 	%rd4, %r8;				// %rd4 = col
	// Bounds guard: exit (BB0_9) if row >= param_3 or col >= N.
	setp.ge.u64 	%p1, %rd3, %rd33;
	setp.ge.u64 	%p2, %rd4, %rd30;
	or.pred  	%p3, %p1, %p2;
	@%p3 bra 	$L__BB0_9;

	// K == 0: skip the reduction entirely and store acc = 0.0f.
	setp.eq.s64 	%p4, %rd29, 0;
	mov.f32 	%f29, 0f00000000;			// %f29 = accumulator, starts at 0.0f
	@%p4 bra 	$L__BB0_8;

	// Loop setup: split K into (K - K%4) unrolled iterations + K%4 remainder.
	mul.lo.s64 	%rd5, %rd3, %rd29;			// %rd5 = row*K (A row offset in elements)
	and.b64  	%rd6, %rd29, 3;				// %rd6 = K % 4 (remainder count)
	add.s64 	%rd35, %rd29, -1;
	setp.lt.u64 	%p5, %rd35, 3;				// K-1 < 3, i.e. K < 4: no unrolled iterations
	mov.f32 	%f29, 0f00000000;
	mov.u64 	%rd57, 0;				// %rd57 = k (elements consumed so far)
	@%p5 bra 	$L__BB0_5;

	// Pointer setup for the unrolled loop.
	sub.s64 	%rd7, %rd6, %rd29;			// %rd7 = (K%4) - K; loop ends when %rd7 + k == 0
	shl.b64 	%rd37, %rd4, 2;
	add.s64 	%rd55, %rd1, %rd37;			// %rd55 = &B[col]        (advances by N floats per k)
	shl.b64 	%rd38, %rd5, 2;
	add.s64 	%rd39, %rd2, %rd38;
	add.s64 	%rd54, %rd39, 8;			// %rd54 = &A[row*K] + 8B (so [-8],[-4],[0],[+4] cover k..k+3)
	shl.b64 	%rd10, %rd30, 2;			// %rd10 = N*4 bytes (B row stride)
	mov.f32 	%f29, 0f00000000;
	mov.u64 	%rd57, 0;

$L__BB0_4:
	// Main reduction, unrolled x4: acc += A[row*K + k+j] * B[(k+j)*N + col], j = 0..3.
	// The four FMAs form a single serial dependency chain through the accumulator
	// (%f29 -> %f14 -> %f17 -> %f20 -> %f29), matching strict sequential fp order.
	ld.global.f32 	%f12, [%rd55];				// B[(k+0)*N + col]
	ld.global.f32 	%f13, [%rd54+-8];			// A[row*K + k+0]
	fma.rn.f32 	%f14, %f13, %f12, %f29;
	add.s64 	%rd40, %rd55, %rd10;
	ld.global.f32 	%f15, [%rd40];				// B[(k+1)*N + col]
	ld.global.f32 	%f16, [%rd54+-4];			// A[row*K + k+1]
	fma.rn.f32 	%f17, %f16, %f15, %f14;
	add.s64 	%rd41, %rd40, %rd10;
	ld.global.f32 	%f18, [%rd41];				// B[(k+2)*N + col]
	ld.global.f32 	%f19, [%rd54];				// A[row*K + k+2]
	fma.rn.f32 	%f20, %f19, %f18, %f17;
	add.s64 	%rd42, %rd41, %rd10;
	add.s64 	%rd55, %rd42, %rd10;			// advance B pointer by 4 rows for next iteration
	ld.global.f32 	%f21, [%rd42];				// B[(k+3)*N + col]
	ld.global.f32 	%f22, [%rd54+4];			// A[row*K + k+3]
	fma.rn.f32 	%f29, %f22, %f21, %f20;
	add.s64 	%rd57, %rd57, 4;			// k += 4
	add.s64 	%rd43, %rd7, %rd57;			// == k - (K - K%4); zero when unrolled part is done
	add.s64 	%rd54, %rd54, 16;			// advance A pointer by 4 floats
	setp.ne.s64 	%p6, %rd43, 0;
	@%p6 bra 	$L__BB0_4;

$L__BB0_5:
	// No remainder (K % 4 == 0): go straight to the store.
	setp.eq.s64 	%p7, %rd6, 0;
	@%p7 bra 	$L__BB0_8;

	// Remainder-loop setup: rebuild both pointers from the current k (%rd57).
	mul.lo.s64 	%rd44, %rd57, %rd30;
	add.s64 	%rd45, %rd44, %rd4;
	shl.b64 	%rd46, %rd45, 2;
	add.s64 	%rd60, %rd1, %rd46;			// %rd60 = &B[k*N + col]
	shl.b64 	%rd19, %rd30, 2;			// %rd19 = N*4 bytes
	add.s64 	%rd47, %rd57, %rd5;
	shl.b64 	%rd48, %rd47, 2;
	add.s64 	%rd59, %rd2, %rd48;			// %rd59 = &A[row*K + k]
	neg.s64 	%rd58, %rd6;				// %rd58 counts -(K%4) .. 0

$L__BB0_7:
	// Remainder loop: one element per iteration; kept rolled by the compiler.
	.pragma "nounroll";
	ld.global.f32 	%f23, [%rd60];
	ld.global.f32 	%f24, [%rd59];
	fma.rn.f32 	%f29, %f24, %f23, %f29;			// acc += A[row*K+k] * B[k*N+col]
	add.s64 	%rd60, %rd60, %rd19;			// B ptr += N floats
	add.s64 	%rd59, %rd59, 4;			// A ptr += 1 float
	add.s64 	%rd58, %rd58, 1;
	setp.ne.s64 	%p8, %rd58, 0;
	@%p8 bra 	$L__BB0_7;

$L__BB0_8:
	// Single store: C[row*N + col] = acc.
	mul.lo.s64 	%rd49, %rd3, %rd30;
	add.s64 	%rd50, %rd49, %rd4;
	cvta.to.global.u64 	%rd51, %rd28;			// C generic -> global
	shl.b64 	%rd52, %rd50, 2;
	add.s64 	%rd53, %rd51, %rd52;
	st.global.f32 	[%rd53], %f29;

$L__BB0_9:
	ret;

}