//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-24817639
// Cuda compilation tools, release 10.0, V10.0.130
// Based on LLVM 3.4svn
//

// NOTE(review): the banner above names CUDA 10.0, while the directives below
// select PTX ISA 3.2 / sm_20 -- looks hand-edited; confirm this pairing is
// intentional before regenerating the file.
.version 3.2
.target sm_20
.address_size 64

    // .globl sum

// Module-scope 32-bit constant in .const space.
// It is not referenced by the `sum` entry below.
.const .align 4 .u32 my_constant = 314;

//----------------------------------------------------------------------
// sum -- element-wise f32 addition using a grid-stride loop.
//
// Pseudo-C equivalent of the instruction stream:
//
//     for (int i = ntid.x * ctaid.x + tid.x;
//          i < (int)sum_param_3;
//          i += nctaid.x * ntid.x)
//         sum_param_2[i] = sum_param_0[i] + sum_param_1[i];
//
// Params:
//   sum_param_0  .u64  generic address of the first  f32 input array
//   sum_param_1  .u64  generic address of the second f32 input array
//   sum_param_2  .u64  generic address of the f32 output array
//   sum_param_3  .u32  element count (compared as a signed 32-bit value)
//
// Register roles:
//   %r10  current element index i (signed 32-bit)
//   %r3   loop stride = nctaid.x * ntid.x
//   %r6   element count
//   %rd3 / %rd2 / %rd1  global-space base addresses of param 0 / 1 / 2
//
// NOTE(review): index and stride are 32-bit; `add.s32` on %r10 can wrap
// when the count approaches 2^31 -- confirm callers stay below that.
//----------------------------------------------------------------------
.visible .entry sum(
    .param .u64 sum_param_0,
    .param .u64 sum_param_1,
    .param .u64 sum_param_2,
    .param .u32 sum_param_3
)
{
    .reg .pred  %p<3>;
    .reg .f32   %f<4>;
    .reg .b32   %r<11>;
    .reg .b64   %rd<11>;

    // Fetch kernel arguments.
    ld.param.u64    %rd4, [sum_param_0];
    ld.param.u64    %rd5, [sum_param_1];
    ld.param.u64    %rd6, [sum_param_2];
    ld.param.u32    %r6, [sum_param_3];

    // i = ntid.x * ctaid.x + tid.x
    mov.u32         %r1, %ntid.x;
    mov.u32         %r7, %ctaid.x;
    mov.u32         %r8, %tid.x;
    mad.lo.s32      %r10, %r1, %r7, %r8;

    // Threads whose first index is already past the end have nothing to do.
    setp.ge.s32     %p1, %r10, %r6;
    @%p1 bra        BB0_3;

    // Loop-invariant setup: convert the generic pointers to global-space
    // addresses once, and compute the stride.
    cvta.to.global.u64  %rd1, %rd6;         // output base  (param 2)
    cvta.to.global.u64  %rd2, %rd5;         // input base   (param 1)
    cvta.to.global.u64  %rd3, %rd4;         // input base   (param 0)
    mov.u32         %r9, %nctaid.x;
    mul.lo.s32      %r3, %r9, %r1;          // stride = nctaid.x * ntid.x

BB0_2:
    // Byte offset of element i: sign-extend to 64 bits and scale by 4.
    mul.wide.s32    %rd7, %r10, 4;
    add.s64         %rd8, %rd3, %rd7;       // &param0[i]
    add.s64         %rd9, %rd2, %rd7;       // &param1[i]
    ld.global.f32   %f1, [%rd9];
    ld.global.f32   %f2, [%rd8];
    add.f32         %f3, %f2, %f1;          // param0[i] + param1[i]
    add.s64         %rd10, %rd1, %rd7;      // &param2[i]
    st.global.f32   [%rd10], %f3;

    // Advance by one full grid and repeat while still in range.
    add.s32         %r10, %r3, %r10;
    setp.lt.s32     %p2, %r10, %r6;
    @%p2 bra        BB0_2;

BB0_3:
    ret;
}