//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-33281558
// Cuda compilation tools, release 12.3, V12.3.52
// Based on NVVM 7.0.1
//
// NOTE(review): this listing has been adjusted by hand after generation
// (64-bit element offsets, padded shared tile). Mirror both changes in the
// CUDA source, otherwise the next compile will regenerate the old code.
//

.version 8.3
.target sm_89
.address_size 64

	// .globl	_Z9transposePfS_m
// _ZZ9transposePfS_mE12sharedMemory has been demoted

//-----------------------------------------------------------------------
// _Z9transposePfS_m  ==  transpose(float*, float*, unsigned long)
//
// Params:
//   param_0 (u64)  generic pointer the kernel READS f32 elements from
//   param_1 (u64)  generic pointer the kernel WRITES f32 elements to
//   param_2 (u64)  n; both coordinates are bounds-checked against n and
//                  n is also the row stride, i.e. both buffers are
//                  addressed as n x n row-major f32 arrays
//
// Flow:
//   1. Each thread stores src[row_in * n + col_in] into
//      tile[tid.x][tid.y], or 0 when (col_in, row_in) is out of range.
//   2. bar.sync 0 (whole-CTA barrier).
//   3. Each in-range thread loads tile[tid.y][tid.x] and stores it to
//      dst[row_out * n + col_out], where the CTA indices are swapped
//      relative to step 1.
//
// Shared tile: 32 rows x 33 f32 (132-byte row stride, 4224 bytes). The
// 33rd column is padding only: with a 128-byte stride the step-1 store
// would place lanes with consecutive tid.x in the same bank.
//
// Element offsets are formed in 64 bits. The bounds checks compare against
// the full 64-bit n, so a 32-bit row*n+col would wrap once n*n exceeds
// 2^31-1 while the thread still passes the range test.
//
// NOTE(review): the tile indexing appears to assume ntid.x <= 32,
// ntid.y <= 32 and ntid.x == ntid.y; none of this is checked here.
// Confirm against the launch site.
//-----------------------------------------------------------------------
.visible .entry _Z9transposePfS_m(
	.param .u64 _Z9transposePfS_m_param_0,
	.param .u64 _Z9transposePfS_m_param_1,
	.param .u64 _Z9transposePfS_m_param_2
)
{
	.reg .pred 	%p<7>;
	.reg .f32 	%f<3>;
	.reg .b32 	%r<19>;
	.reg .b64 	%rd<16>;
	// demoted variable (32 rows * 132 bytes, see header)
	.shared .align 4 .b8 _ZZ9transposePfS_mE12sharedMemory[4224];

	ld.param.u64 	%rd1, [_Z9transposePfS_m_param_0];	// src
	ld.param.u64 	%rd2, [_Z9transposePfS_m_param_1];	// dst
	ld.param.u64 	%rd3, [_Z9transposePfS_m_param_2];	// n

	mov.u32 	%r1, %ntid.x;
	mov.u32 	%r2, %ctaid.x;
	mov.u32 	%r3, %tid.x;
	mov.u32 	%r4, %ntid.y;
	mov.u32 	%r5, %ctaid.y;
	mov.u32 	%r6, %tid.y;

	// Input coordinates use (ctaid.x, ctaid.y); output coordinates use the
	// CTA indices swapped so the tile lands at its mirrored position.
	mad.lo.s32 	%r7, %r2, %r1, %r3;	// col_in  = ctaid.x*ntid.x + tid.x
	mad.lo.s32 	%r8, %r5, %r4, %r6;	// row_in  = ctaid.y*ntid.y + tid.y
	mad.lo.s32 	%r9, %r5, %r1, %r3;	// col_out = ctaid.y*ntid.x + tid.x
	mad.lo.s32 	%r10, %r2, %r4, %r6;	// row_out = ctaid.x*ntid.y + tid.y

	// in_range = (col_in < n) && (row_in < n), unsigned 64-bit compare on
	// the sign-extended coordinates.
	cvt.s64.s32 	%rd4, %r7;
	cvt.s64.s32 	%rd5, %r8;
	setp.lt.u64 	%p1, %rd4, %rd3;
	setp.lt.u64 	%p2, %rd5, %rd3;
	and.pred  	%p3, %p1, %p2;

	// %r14 = &tile[tid.x][tid.y]  (row stride 132 bytes, element 4 bytes)
	mov.u32 	%r11, _ZZ9transposePfS_mE12sharedMemory;
	mad.lo.s32 	%r12, %r3, 132, %r11;
	shl.b32 	%r13, %r6, 2;
	add.s32 	%r14, %r12, %r13;

	@%p3 bra 	$L__BB0_2;
	bra.uni 	$L__BB0_1;

$L__BB0_2:
	// In range: tile[tid.x][tid.y] = src[row_in * n + col_in]
	cvta.to.global.u64 	%rd6, %rd1;
	mad.lo.u64 	%rd7, %rd5, %rd3, %rd4;	// 64-bit element index
	shl.b64 	%rd8, %rd7, 2;		// * sizeof(f32)
	add.s64 	%rd9, %rd6, %rd8;
	ld.global.f32 	%f1, [%rd9];
	st.shared.f32 	[%r14], %f1;
	bra.uni 	$L__BB0_3;

$L__BB0_1:
	// Out of range: store a zero word so the tile slot is defined.
	mov.u32 	%r15, 0;
	st.shared.u32 	[%r14], %r15;

$L__BB0_3:
	// Every thread of the CTA reaches this barrier (both paths above join
	// here) before any thread reads the tile.
	bar.sync 	0;

	// Skip the store when (col_out >= n) || (row_out >= n).
	cvt.s64.s32 	%rd10, %r9;
	cvt.s64.s32 	%rd11, %r10;
	setp.ge.u64 	%p4, %rd10, %rd3;
	setp.ge.u64 	%p5, %rd11, %rd3;
	or.pred  	%p6, %p4, %p5;
	@%p6 bra 	$L__BB0_5;

	// dst[row_out * n + col_out] = tile[tid.y][tid.x]
	mad.lo.s32 	%r16, %r6, 132, %r11;
	shl.b32 	%r17, %r3, 2;
	add.s32 	%r18, %r16, %r17;
	ld.shared.f32 	%f2, [%r18];
	cvta.to.global.u64 	%rd12, %rd2;
	mad.lo.u64 	%rd13, %rd11, %rd3, %rd10;	// 64-bit element index
	shl.b64 	%rd14, %rd13, 2;		// * sizeof(f32)
	add.s64 	%rd15, %rd12, %rd14;
	st.global.f32 	[%rd15], %f2;

$L__BB0_5:
	ret;

}