// // Generated by NVIDIA NVVM Compiler // // Compiler Build ID: CL-30411180 // Cuda compilation tools, release 11.5, V11.5.50 // Based on NVVM 7.0.1 // .version 7.5 .target sm_70 .address_size 64 // .globl g2p2g .func _ZN4core6result13unwrap_failed17ha84c04ab95b75c50E () .noreturn ; // _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E has been demoted // _ZN16sparkl2d_kernels4cuda10prefix_sum14prefix_sum_51212shared_array6SHARED17h1f773b50ba2bbc70E has been demoted .global .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162}; .visible .entry g2p2g( .param .f32 g2p2g_param_0, .param .u64 g2p2g_param_1, .param .u64 g2p2g_param_2, .param .u64 g2p2g_param_3, .param .u64 g2p2g_param_4, .param .u64 g2p2g_param_5, .param .u64 g2p2g_param_6, .param .u64 g2p2g_param_7, .param .u64 g2p2g_param_8, .param .u64 g2p2g_param_9, .param .align 8 .b8 g2p2g_param_10[72], .param .align 8 .b8 g2p2g_param_11[72], .param .u32 g2p2g_param_12, .param .u8 g2p2g_param_13 ) { .local .align 16 .b8 __local_depot0[112]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<839>; .reg .b16 %rs<36>; .reg .f32 %f<5639>; .reg .b32 %r<1795>; .reg .f64 %fd<27>; .reg .b64 %rd<1395>; // demoted variable .shared .align 8 .b8 _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E[4096]; mov.u64 %SPL, __local_depot0; cvta.local.u64 %SP, %SPL; ld.param.f32 %f980, [g2p2g_param_0]; ld.param.u64 %rd335, [g2p2g_param_3]; ld.param.u64 %rd336, [g2p2g_param_4]; ld.param.u64 %rd337, [g2p2g_param_5]; ld.param.u64 %rd338, [g2p2g_param_6]; ld.param.u64 %rd339, [g2p2g_param_7]; ld.param.u64 %rd340, [g2p2g_param_8]; ld.param.u8 %r315, [g2p2g_param_13]; ld.param.u8 %r316, [g2p2g_param_13+1]; prmt.b32 %r317, %r316, %r315, 30212; and.b32 %r318, %r317, 1; setp.eq.b32 %p24, %r318, 1; ld.param.u64 %rd355, [g2p2g_param_11+64]; ld.param.u64 %rd354, [g2p2g_param_11+56]; 
ld.param.u64 %rd353, [g2p2g_param_11+48]; ld.param.u64 %rd352, [g2p2g_param_11+32]; ld.param.u64 %rd350, [g2p2g_param_11+16]; ld.param.u64 %rd349, [g2p2g_param_11+8]; ld.param.f32 %f982, [g2p2g_param_11]; ld.param.u64 %rd348, [g2p2g_param_10+64]; ld.param.u32 %r312, [g2p2g_param_10+40]; ld.param.u64 %rd345, [g2p2g_param_10+32]; ld.param.u64 %rd342, [g2p2g_param_10+8]; cvta.to.global.u64 %rd2, %rd342; cvta.to.global.u64 %rd3, %rd345; cvta.to.global.u64 %rd5, %rd352; mov.u32 %r1, %tid.x; mov.u32 %r2, %ntid.x; setp.eq.s32 %p25, %r2, 0; @%p25 bra $L__BB0_653; mov.u32 %r319, %ctaid.x; selp.b64 %rd356, %rd354, %rd353, %p24; cvta.to.global.u64 %rd357, %rd356; mul.wide.u32 %rd358, %r319, 8; add.s64 %rd6, %rd357, %rd358; mov.u32 %r320, 64; div.u32 %r3, %r320, %r2; cvt.u64.u32 %rd7, %r3; mul.wide.u32 %rd8, %r3, %r1; setp.gt.u64 %p26, %rd8, 127; @%p26 bra $L__BB0_652; ld.global.u32 %r4, [%rd6+4]; ld.global.u32 %r321, [%rd6]; cvta.to.global.u64 %rd359, %rd350; mul.wide.u32 %rd360, %r321, 24; add.s64 %rd361, %rd359, %rd360; ld.global.u64 %rd362, [%rd361]; ld.global.v2.u32 {%r322, %r323}, [%rd361+8]; bfe.u64 %rd9, %rd8, 4, 1; bfe.u64 %rd10, %rd8, 5, 1; add.s64 %rd363, %rd9, %rd362; and.b64 %rd364, %rd363, 4294967295; shl.b64 %rd365, %rd8, 27; and.b64 %rd366, %rd365, 4294967296; add.s64 %rd367, %rd366, %rd362; and.b64 %rd368, %rd367, -4294967296; or.b64 %rd11, %rd368, %rd364; shr.u64 %rd369, %rd11, 16; xor.b64 %rd370, %rd369, %rd11; mul.lo.s64 %rd371, %rd370, 2246822507; shr.u64 %rd372, %rd371, 13; xor.b64 %rd373, %rd372, %rd371; mul.lo.s64 %rd374, %rd373, 3266489909; shr.u64 %rd375, %rd374, 16; xor.b64 %rd12, %rd375, %rd374; cvt.u64.u32 %rd376, %r312; add.s64 %rd13, %rd376, -1; and.b64 %rd1302, %rd12, %rd13; shl.b64 %rd377, %rd1302, 4; add.s64 %rd378, %rd3, %rd377; ld.global.u64 %rd15, [%rd378]; setp.eq.s64 %p27, %rd15, %rd11; @%p27 bra $L__BB0_16; bra.uni $L__BB0_3; $L__BB0_16: setp.gt.u32 %p38, %r2, 64; @%p38 bra $L__BB0_31; bra.uni $L__BB0_17; $L__BB0_3: setp.eq.s64 %p28, 
%rd15, -1; @%p28 bra $L__BB0_9; $L__BB0_5: add.s64 %rd379, %rd1302, 1; and.b64 %rd1302, %rd379, %rd13; shl.b64 %rd380, %rd1302, 4; add.s64 %rd381, %rd3, %rd380; ld.global.u64 %rd18, [%rd381]; setp.eq.s64 %p29, %rd18, %rd11; @%p29 bra $L__BB0_8; setp.ne.s64 %p30, %rd18, -1; @%p30 bra $L__BB0_5; setp.lt.u32 %p31, %r2, 65; @%p31 bra $L__BB0_10; bra.uni $L__BB0_31; $L__BB0_9: setp.gt.u32 %p33, %r2, 64; @%p33 bra $L__BB0_31; $L__BB0_10: and.b64 %rd19, %rd8, 15; add.s64 %rd20, %rd19, %rd7; shl.b64 %rd21, %rd9, 2; shl.b64 %rd22, %rd10, 2; add.s64 %rd382, %rd19, 1; max.u64 %rd23, %rd382, %rd20; sub.s64 %rd383, %rd23, %rd8; and.b64 %rd1304, %rd383, 3; setp.eq.s64 %p34, %rd1304, 0; mov.u64 %rd1310, %rd19; @%p34 bra $L__BB0_13; mov.f32 %f983, 0f00000000; mov.u32 %r324, -1; mov.u64 %rd1303, %rd19; $L__BB0_12: .pragma "nounroll"; add.s64 %rd1310, %rd1303, 1; bfe.u64 %rd384, %rd1303, 2, 2; and.b64 %rd385, %rd1303, 3; or.b64 %rd386, %rd385, %rd21; or.b64 %rd387, %rd384, %rd22; shl.b64 %rd388, %rd387, 3; or.b64 %rd389, %rd386, %rd388; shl.b64 %rd390, %rd389, 6; mov.u64 %rd391, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; mov.u64 %rd392, 0; add.s64 %rd393, %rd391, %rd390; st.shared.u64 [%rd393+32], %rd392; st.shared.v2.f32 [%rd393+40], {%f983, %f983}; st.shared.v2.f32 [%rd393+24], {%f983, %f983}; st.shared.v2.f32 [%rd393+16], {%f983, %f983}; st.shared.u32 [%rd393+56], %rd392; st.shared.u64 [%rd393+48], %rd392; st.shared.u64 [%rd393], %rd392; st.shared.u32 [%rd393+60], %r324; add.s64 %rd1304, %rd1304, -1; setp.ne.s64 %p35, %rd1304, 0; mov.u64 %rd1303, %rd1310; @%p35 bra $L__BB0_12; $L__BB0_13: not.b64 %rd394, %rd19; add.s64 %rd395, %rd23, %rd394; setp.lt.u64 %p36, %rd395, 3; @%p36 bra $L__BB0_31; add.s64 %rd396, %rd1310, 3; and.b64 %rd397, %rd396, 3; add.s64 %rd398, %rd1310, 1; and.b64 %rd399, %rd398, 3; and.b64 %rd400, %rd1310, 3; or.b64 %rd30, %rd400, %rd21; or.b64 %rd31, %rd399, %rd21; or.b64 %rd32, %rd397, %rd21; shr.u64 %rd1309, 
%rd396, 2; add.s64 %rd401, %rd1310, 2; shr.u64 %rd1308, %rd401, 2; shr.u64 %rd1307, %rd1310, 2; shr.u64 %rd1306, %rd398, 2; mov.f32 %f984, 0f00000000; mov.u32 %r325, -1; $L__BB0_15: and.b64 %rd402, %rd1307, 3; or.b64 %rd403, %rd402, %rd22; shl.b64 %rd404, %rd403, 3; or.b64 %rd405, %rd30, %rd404; shl.b64 %rd406, %rd405, 6; mov.u64 %rd407, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; mov.u64 %rd408, 0; add.s64 %rd409, %rd407, %rd406; st.shared.u64 [%rd409+32], %rd408; st.shared.v2.f32 [%rd409+40], {%f984, %f984}; st.shared.v2.f32 [%rd409+24], {%f984, %f984}; st.shared.v2.f32 [%rd409+16], {%f984, %f984}; st.shared.u32 [%rd409+56], %rd408; st.shared.u64 [%rd409+48], %rd408; st.shared.u64 [%rd409], %rd408; st.shared.u32 [%rd409+60], %r325; and.b64 %rd410, %rd1306, 3; or.b64 %rd411, %rd410, %rd22; shl.b64 %rd412, %rd411, 3; or.b64 %rd413, %rd31, %rd412; shl.b64 %rd414, %rd413, 6; add.s64 %rd415, %rd407, %rd414; st.shared.u64 [%rd415+32], %rd408; st.shared.v2.f32 [%rd415+40], {%f984, %f984}; st.shared.v2.f32 [%rd415+24], {%f984, %f984}; st.shared.v2.f32 [%rd415+16], {%f984, %f984}; st.shared.u32 [%rd415+56], %rd408; st.shared.u64 [%rd415+48], %rd408; st.shared.u64 [%rd415], %rd408; st.shared.u32 [%rd415+60], %r325; and.b64 %rd416, %rd1308, 3; or.b64 %rd417, %rd416, %rd22; shl.b64 %rd418, %rd417, 3; or.b64 %rd419, %rd30, %rd418; shl.b64 %rd420, %rd419, 6; xor.b64 %rd421, %rd420, 128; add.s64 %rd422, %rd407, %rd421; st.shared.u64 [%rd422+32], %rd408; st.shared.v2.f32 [%rd422+40], {%f984, %f984}; st.shared.v2.f32 [%rd422+24], {%f984, %f984}; st.shared.v2.f32 [%rd422+16], {%f984, %f984}; st.shared.u32 [%rd422+56], %rd408; st.shared.u64 [%rd422+48], %rd408; st.shared.u64 [%rd422], %rd408; st.shared.u32 [%rd422+60], %r325; and.b64 %rd423, %rd1309, 3; or.b64 %rd424, %rd423, %rd22; shl.b64 %rd425, %rd424, 3; or.b64 %rd426, %rd32, %rd425; shl.b64 %rd427, %rd426, 6; add.s64 %rd428, %rd407, %rd427; st.shared.u64 [%rd428+32], %rd408; 
st.shared.v2.f32 [%rd428+40], {%f984, %f984}; st.shared.v2.f32 [%rd428+24], {%f984, %f984}; st.shared.v2.f32 [%rd428+16], {%f984, %f984}; st.shared.u64 [%rd428+48], %rd408; st.shared.u32 [%rd428+56], %rd408; st.shared.u64 [%rd428], %rd408; st.shared.u32 [%rd428+60], %r325; add.s64 %rd1309, %rd1309, 1; add.s64 %rd1308, %rd1308, 1; add.s64 %rd1307, %rd1307, 1; add.s64 %rd1306, %rd1306, 1; add.s64 %rd1310, %rd1310, 4; setp.lt.u64 %p37, %rd1310, %rd20; @%p37 bra $L__BB0_15; bra.uni $L__BB0_31; $L__BB0_8: setp.lt.u32 %p32, %r2, 65; @%p32 bra $L__BB0_17; bra.uni $L__BB0_31; $L__BB0_17: and.b64 %rd1313, %rd8, 15; add.s64 %rd50, %rd1313, %rd7; shl.b64 %rd429, %rd1302, 4; add.s64 %rd430, %rd3, %rd429; shl.b64 %rd51, %rd9, 2; shl.b64 %rd52, %rd10, 2; ld.global.u32 %r326, [%rd430+8]; mul.wide.u32 %rd53, %r326, 16; add.s64 %rd431, %rd1313, 1; max.u64 %rd432, %rd431, %rd50; sub.s64 %rd433, %rd432, %rd8; and.b64 %rd434, %rd433, 1; setp.eq.b64 %p39, %rd434, 1; mov.pred %p40, 0; xor.pred %p41, %p39, %p40; not.pred %p42, %p41; @%p42 bra $L__BB0_22; and.b64 %rd435, %rd8, 3; bfe.u64 %rd436, %rd8, 2, 2; or.b64 %rd437, %rd435, %rd51; or.b64 %rd438, %rd436, %rd52; shl.b64 %rd439, %rd438, 3; or.b64 %rd440, %rd437, %rd439; or.b64 %rd441, %rd435, %rd53; and.b64 %rd442, %rd8, 12; or.b64 %rd54, %rd441, %rd442; setp.gt.u64 %p43, %rd348, %rd54; shl.b64 %rd443, %rd440, 6; mov.u64 %rd444, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; add.s64 %rd55, %rd444, %rd443; @%p43 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: mul.lo.s64 %rd446, %rd54, 48; add.s64 %rd447, %rd2, %rd446; ld.global.u32 %rd448, [%rd447+4]; ld.global.u32 %rd449, [%rd447+8]; bfi.b64 %rd450, %rd449, %rd448, 32, 32; st.shared.u64 [%rd55+32], %rd450; ld.global.u32 %r328, [%rd447+12]; st.shared.u32 [%rd55+48], %r328; ld.global.u64 %rd451, [%rd447+40]; st.shared.u32 [%rd55+52], %rd451; shr.u64 %rd452, %rd451, 32; st.shared.u32 [%rd55+56], %rd452; ld.global.u64 %rd453, [%rd447+24]; 
ld.global.u64 %rd454, [%rd447+32]; st.shared.u64 [%rd55], %rd453; st.shared.u64 [%rd55+8], %rd454; ld.global.u32 %r329, [%rd447+20]; st.shared.u32 [%rd55+16], %r329; bra.uni $L__BB0_21; $L__BB0_19: mov.u64 %rd445, 0; st.shared.u64 [%rd55+32], %rd445; mov.u32 %r327, 0; st.shared.u32 [%rd55+56], %rd445; st.shared.u64 [%rd55+48], %rd445; st.shared.u64 [%rd55], %rd445; st.shared.u32 [%rd55+16], %r327; $L__BB0_21: mov.u32 %r330, 0; mov.u64 %rd455, 0; mov.f32 %f985, 0f00000000; st.shared.v2.f32 [%rd55+40], {%f985, %f985}; st.shared.u64 [%rd55+24], %rd455; st.shared.u32 [%rd55+20], %r330; mov.u32 %r331, -1; st.shared.u32 [%rd55+60], %r331; mov.u64 %rd1313, %rd431; $L__BB0_22: setp.ge.u64 %p44, %rd431, %rd50; @%p44 bra $L__BB0_31; mov.u64 %rd466, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; mov.u32 %r335, 0; mov.f32 %f986, 0f00000000; mov.u32 %r336, -1; $L__BB0_24: bfe.u64 %rd457, %rd1313, 2, 2; and.b64 %rd458, %rd1313, 3; or.b64 %rd459, %rd458, %rd51; or.b64 %rd460, %rd457, %rd52; shl.b64 %rd461, %rd460, 3; or.b64 %rd462, %rd459, %rd461; or.b64 %rd463, %rd458, %rd53; and.b64 %rd464, %rd1313, 12; or.b64 %rd59, %rd463, %rd464; setp.gt.u64 %p45, %rd348, %rd59; shl.b64 %rd465, %rd462, 6; add.s64 %rd60, %rd466, %rd465; @%p45 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: mul.lo.s64 %rd468, %rd59, 48; add.s64 %rd469, %rd2, %rd468; ld.global.u32 %rd470, [%rd469+4]; ld.global.u32 %rd471, [%rd469+8]; bfi.b64 %rd472, %rd471, %rd470, 32, 32; st.shared.u64 [%rd60+32], %rd472; ld.global.u32 %r333, [%rd469+12]; st.shared.u32 [%rd60+48], %r333; ld.global.u64 %rd473, [%rd469+40]; st.shared.u32 [%rd60+52], %rd473; shr.u64 %rd474, %rd473, 32; st.shared.u32 [%rd60+56], %rd474; ld.global.u64 %rd475, [%rd469+24]; ld.global.u64 %rd476, [%rd469+32]; st.shared.u64 [%rd60], %rd475; st.shared.u64 [%rd60+8], %rd476; ld.global.u32 %r334, [%rd469+20]; st.shared.u32 [%rd60+16], %r334; bra.uni $L__BB0_27; $L__BB0_25: mov.u64 %rd467, 0; st.shared.u64 
[%rd60+32], %rd467; st.shared.u32 [%rd60+56], %rd467; st.shared.u64 [%rd60+48], %rd467; st.shared.u64 [%rd60], %rd467; st.shared.u32 [%rd60+16], %r335; $L__BB0_27: mov.u64 %rd477, 0; st.shared.v2.f32 [%rd60+40], {%f986, %f986}; st.shared.u64 [%rd60+24], %rd477; st.shared.u32 [%rd60+20], %r335; st.shared.u32 [%rd60+60], %r336; add.s64 %rd61, %rd1313, 2; add.s64 %rd478, %rd1313, 1; and.b64 %rd479, %rd478, 3; bfe.u64 %rd480, %rd478, 2, 2; or.b64 %rd481, %rd479, %rd51; or.b64 %rd482, %rd480, %rd52; shl.b64 %rd483, %rd482, 3; or.b64 %rd484, %rd481, %rd483; or.b64 %rd485, %rd479, %rd53; and.b64 %rd486, %rd478, 12; or.b64 %rd62, %rd485, %rd486; setp.gt.u64 %p46, %rd348, %rd62; shl.b64 %rd487, %rd484, 6; add.s64 %rd63, %rd466, %rd487; @%p46 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: mul.lo.s64 %rd490, %rd62, 48; add.s64 %rd491, %rd2, %rd490; ld.global.u32 %rd492, [%rd491+4]; ld.global.u32 %rd493, [%rd491+8]; bfi.b64 %rd494, %rd493, %rd492, 32, 32; st.shared.u64 [%rd63+32], %rd494; ld.global.u32 %r338, [%rd491+12]; st.shared.u32 [%rd63+48], %r338; ld.global.u64 %rd495, [%rd491+40]; st.shared.u32 [%rd63+52], %rd495; shr.u64 %rd496, %rd495, 32; st.shared.u32 [%rd63+56], %rd496; ld.global.u64 %rd497, [%rd491+24]; ld.global.u64 %rd498, [%rd491+32]; st.shared.u64 [%rd63], %rd497; st.shared.u64 [%rd63+8], %rd498; ld.global.u32 %r339, [%rd491+20]; st.shared.u32 [%rd63+16], %r339; bra.uni $L__BB0_30; $L__BB0_28: st.shared.u64 [%rd63+32], %rd477; st.shared.u32 [%rd63+56], %rd477; st.shared.u64 [%rd63+48], %rd477; st.shared.u64 [%rd63], %rd477; st.shared.u32 [%rd63+16], %r335; $L__BB0_30: st.shared.v2.f32 [%rd63+40], {%f986, %f986}; st.shared.u64 [%rd63+24], %rd477; st.shared.u32 [%rd63+20], %r335; st.shared.u32 [%rd63+60], %r336; setp.lt.u64 %p47, %rd61, %rd50; mov.u64 %rd1313, %rd61; @%p47 bra $L__BB0_24; $L__BB0_31: bar.sync 0; add.s32 %r342, %r323, %r322; add.s32 %r7, %r4, %r1; setp.ge.u32 %p48, %r7, %r342; @%p48 bra $L__BB0_628; cvta.to.global.u64 %rd500, %rd340; 
mul.wide.u32 %rd501, %r7, 4; add.s64 %rd502, %rd500, %rd501; ld.global.u32 %r8, [%rd502]; cvta.to.global.u64 %rd503, %rd335; mul.wide.u32 %rd504, %r8, 24; add.s64 %rd505, %rd503, %rd504; ld.global.v4.u8 {%rs7, %rs8, %rs9, %rs10}, [%rd505]; ld.global.u32 %rd506, [%rd505+4]; ld.global.u32 %rd507, [%rd505+8]; bfi.b64 %rd64, %rd507, %rd506, 32, 32; ld.global.u32 %r9, [%rd505+12]; ld.global.u64 %rd65, [%rd505+16]; cvta.to.global.u64 %rd508, %rd336; mul.wide.u32 %rd509, %r8, 8; add.s64 %rd510, %rd508, %rd509; ld.global.f32 %f2, [%rd510]; ld.global.f32 %f3, [%rd510+4]; cvta.to.global.u64 %rd511, %rd337; add.s64 %rd512, %rd511, %rd509; ld.global.u32 %rd513, [%rd512]; ld.global.u32 %rd514, [%rd512+4]; bfi.b64 %rd515, %rd514, %rd513, 32, 32; add.u64 %rd517, %SPL, 96; st.local.u64 [%rd517], %rd515; cvta.to.global.u64 %rd518, %rd338; mul.wide.u32 %rd519, %r8, 32; add.s64 %rd520, %rd518, %rd519; ld.global.f32 %f4, [%rd520]; ld.global.f32 %f5, [%rd520+4]; ld.global.f32 %f6, [%rd520+8]; ld.global.f32 %f7, [%rd520+12]; ld.global.f32 %f5531, [%rd520+16]; ld.global.f32 %f5530, [%rd520+20]; ld.global.f32 %f10, [%rd520+24]; add.u64 %rd522, %SPL, 80; st.local.v4.f32 [%rd522], {%f7, %f5531, %f5530, %f10}; ld.global.f32 %f5532, [%rd520+28]; cvta.to.global.u64 %rd523, %rd339; add.s64 %rd524, %rd523, %rd509; ld.global.u32 %r10, [%rd524]; ld.global.u32 %r1794, [%rd524+4]; mul.f32 %f991, %f982, %f982; mov.f32 %f992, 0f40800000; div.rn.f32 %f12, %f992, %f991; div.rn.f32 %f993, %f2, %f982; div.rn.f32 %f994, %f3, %f982; mov.b32 %r343, %f993; and.b32 %r344, %r343, -2147483648; or.b32 %r345, %r344, 1056964608; mov.b32 %f995, %r345; add.rz.f32 %f996, %f993, %f995; cvt.rzi.f32.f32 %f13, %f996; mov.b32 %r346, %f994; and.b32 %r347, %r346, -2147483648; or.b32 %r348, %r347, 1056964608; mov.b32 %f997, %r348; add.rz.f32 %f998, %f994, %f997; cvt.rzi.f32.f32 %f14, %f998; add.f32 %f999, %f13, 0fBF800000; add.f32 %f1000, %f14, 0fBF800000; mul.f32 %f1001, %f982, %f999; mul.f32 %f1002, %f982, %f1000; sub.f32 
%f15, %f1001, %f2; sub.f32 %f16, %f1002, %f3; add.u64 %rd526, %SPL, 64; mov.u64 %rd527, 0; st.local.v2.u64 [%rd526], {%rd527, %rd527}; neg.f32 %f1003, %f15; div.rn.f32 %f17, %f1003, %f982; mov.f32 %f1004, 0f3FC00000; sub.f32 %f18, %f1004, %f17; mov.f32 %f1007, 0f40000000; abs.f32 %f21, %f18; setp.lt.f32 %p49, %f21, 0f00800000; mul.f32 %f1009, %f21, 0f4B800000; selp.f32 %f1010, %f1009, %f21, %p49; selp.f32 %f1011, 0fC3170000, 0fC2FE0000, %p49; mov.b32 %r349, %f1010; and.b32 %r350, %r349, 8388607; or.b32 %r351, %r350, 1065353216; mov.b32 %f1012, %r351; shr.u32 %r352, %r349, 23; cvt.rn.f32.u32 %f1013, %r352; add.f32 %f1014, %f1011, %f1013; setp.gt.f32 %p50, %f1012, 0f3FB504F3; mul.f32 %f1015, %f1012, 0f3F000000; add.f32 %f1016, %f1014, 0f3F800000; selp.f32 %f1017, %f1016, %f1014, %p50; selp.f32 %f1018, %f1015, %f1012, %p50; add.f32 %f1019, %f1018, 0fBF800000; add.f32 %f989, %f1018, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f988,%f989; // end inline asm add.f32 %f1020, %f1019, %f1019; mul.f32 %f1021, %f988, %f1020; mul.f32 %f1022, %f1021, %f1021; mov.f32 %f1023, 0f3C4CAF63; mov.f32 %f1024, 0f3B18F0FE; fma.rn.f32 %f1025, %f1024, %f1022, %f1023; mov.f32 %f1026, 0f3DAAAABD; fma.rn.f32 %f1027, %f1025, %f1022, %f1026; mul.rn.f32 %f1028, %f1027, %f1022; mul.rn.f32 %f1029, %f1028, %f1021; sub.f32 %f1030, %f1019, %f1021; add.f32 %f1031, %f1030, %f1030; neg.f32 %f1032, %f1021; fma.rn.f32 %f1033, %f1032, %f1019, %f1031; mul.rn.f32 %f1034, %f988, %f1033; add.f32 %f1035, %f1029, %f1021; sub.f32 %f1036, %f1021, %f1035; add.f32 %f1037, %f1029, %f1036; add.f32 %f1038, %f1034, %f1037; add.f32 %f1039, %f1035, %f1038; sub.f32 %f1040, %f1035, %f1039; add.f32 %f1041, %f1038, %f1040; mov.f32 %f1042, 0f3F317200; mul.rn.f32 %f1043, %f1017, %f1042; mov.f32 %f1044, 0f35BFBE8E; mul.rn.f32 %f1045, %f1017, %f1044; add.f32 %f1046, %f1043, %f1039; sub.f32 %f1047, %f1043, %f1046; add.f32 %f1048, %f1039, %f1047; add.f32 %f1049, %f1041, %f1048; add.f32 %f1050, %f1045, %f1049; add.f32 %f1051, 
%f1046, %f1050; sub.f32 %f1052, %f1046, %f1051; add.f32 %f1053, %f1050, %f1052; mul.rn.f32 %f1054, %f1007, %f1051; neg.f32 %f1055, %f1054; fma.rn.f32 %f1056, %f1007, %f1051, %f1055; fma.rn.f32 %f1057, %f1007, %f1053, %f1056; mov.f32 %f1058, 0f00000000; fma.rn.f32 %f1059, %f1058, %f1051, %f1057; add.rn.f32 %f1060, %f1054, %f1059; neg.f32 %f1061, %f1060; add.rn.f32 %f1062, %f1054, %f1061; add.rn.f32 %f1063, %f1062, %f1059; mov.b32 %r353, %f1060; setp.eq.s32 %p51, %r353, 1118925336; add.s32 %r354, %r353, -1; mov.b32 %f1064, %r354; add.f32 %f1065, %f1063, 0f37000000; selp.f32 %f22, %f1065, %f1063, %p51; selp.f32 %f1066, %f1064, %f1060, %p51; mov.f32 %f1067, 0f3FB8AA3B; mul.rn.f32 %f1068, %f1066, %f1067; cvt.rzi.f32.f32 %f1069, %f1068; abs.f32 %f1070, %f1069; setp.gt.f32 %p52, %f1070, 0f42FC0000; mov.b32 %r355, %f1069; and.b32 %r356, %r355, -2147483648; or.b32 %r357, %r356, 1123811328; mov.b32 %f1071, %r357; selp.f32 %f1072, %f1071, %f1069, %p52; mov.f32 %f1073, 0fBF317218; fma.rn.f32 %f1074, %f1072, %f1073, %f1066; mov.f32 %f1075, 0f3102E308; fma.rn.f32 %f1076, %f1072, %f1075, %f1074; mul.f32 %f1077, %f1076, 0f3FB8AA3B; add.f32 %f1078, %f1072, 0f4B40007F; mov.b32 %r358, %f1078; shl.b32 %r359, %r358, 23; mov.b32 %f1079, %r359; ex2.approx.ftz.f32 %f1080, %f1077; mul.f32 %f23, %f1080, %f1079; setp.eq.f32 %p53, %f23, 0f7F800000; mov.f32 %f5459, 0f7F800000; @%p53 bra $L__BB0_34; fma.rn.f32 %f5459, %f23, %f22, %f23; $L__BB0_34: mov.f32 %f5191, 0f3F800000; cvt.rzi.f32.f32 %f5190, %f5191; add.f32 %f5189, %f5190, %f5190; mov.f32 %f5188, 0f40000000; sub.f32 %f5187, %f5188, %f5189; abs.f32 %f5186, %f5187; setp.lt.f32 %p54, %f18, 0f00000000; setp.eq.f32 %p55, %f5186, 0f3F800000; and.pred %p1, %p54, %p55; setp.eq.f32 %p56, %f18, 0f00000000; @%p56 bra $L__BB0_38; bra.uni $L__BB0_35; $L__BB0_38: add.f32 %f1085, %f18, %f18; selp.f32 %f5461, %f1085, 0f00000000, %p55; bra.uni $L__BB0_39; $L__BB0_35: mov.b32 %r360, %f5459; xor.b32 %r361, %r360, -2147483648; mov.b32 %f1081, %r361; 
selp.f32 %f5461, %f1081, %f5459, %p1; setp.geu.f32 %p57, %f18, 0f00000000; @%p57 bra $L__BB0_39; cvt.rzi.f32.f32 %f1083, %f1007; setp.eq.f32 %p58, %f1083, 0f40000000; @%p58 bra $L__BB0_39; mov.f32 %f5461, 0f7FFFFFFF; $L__BB0_39: add.f32 %f1086, %f21, 0f40000000; mov.b32 %r362, %f1086; setp.lt.s32 %p60, %r362, 2139095040; @%p60 bra $L__BB0_44; setp.gtu.f32 %p61, %f21, 0f7F800000; @%p61 bra $L__BB0_43; bra.uni $L__BB0_41; $L__BB0_43: add.f32 %f5461, %f18, 0f40000000; bra.uni $L__BB0_44; $L__BB0_41: setp.neu.f32 %p62, %f21, 0f7F800000; @%p62 bra $L__BB0_44; selp.f32 %f5461, 0fFF800000, 0f7F800000, %p1; $L__BB0_44: mov.f32 %f5415, 0f3102E308; mov.f32 %f5414, 0fBF317218; mov.f32 %f5413, 0f3FB8AA3B; mov.f32 %f5412, 0f35BFBE8E; mov.f32 %f5411, 0f3F317200; mov.f32 %f5410, 0f3DAAAABD; mov.f32 %f5409, 0f3C4CAF63; mov.f32 %f5408, 0f3B18F0FE; mul.f32 %f1090, %f5461, 0f3F000000; setp.eq.f32 %p63, %f18, 0f3F800000; selp.f32 %f32, 0f3F000000, %f1090, %p63; add.f32 %f33, %f17, 0fBF800000; abs.f32 %f34, %f33; setp.lt.f32 %p64, %f34, 0f00800000; mul.f32 %f1091, %f34, 0f4B800000; selp.f32 %f1092, %f1091, %f34, %p64; selp.f32 %f1093, 0fC3170000, 0fC2FE0000, %p64; mov.b32 %r363, %f1092; and.b32 %r364, %r363, 8388607; or.b32 %r365, %r364, 1065353216; mov.b32 %f1094, %r365; shr.u32 %r366, %r363, 23; cvt.rn.f32.u32 %f1095, %r366; add.f32 %f1096, %f1093, %f1095; setp.gt.f32 %p65, %f1094, 0f3FB504F3; mul.f32 %f1097, %f1094, 0f3F000000; add.f32 %f1098, %f1096, 0f3F800000; selp.f32 %f1099, %f1098, %f1096, %p65; selp.f32 %f1100, %f1097, %f1094, %p65; add.f32 %f1101, %f1100, 0fBF800000; add.f32 %f1088, %f1100, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f1087,%f1088; // end inline asm add.f32 %f1102, %f1101, %f1101; mul.f32 %f1104, %f1087, %f1102; mul.f32 %f1105, %f1104, %f1104; fma.rn.f32 %f1108, %f5408, %f1105, %f5409; fma.rn.f32 %f1110, %f1108, %f1105, %f5410; mul.rn.f32 %f1111, %f1110, %f1105; mul.rn.f32 %f1112, %f1111, %f1104; sub.f32 %f1113, %f1101, %f1104; add.f32 %f1114, %f1113, 
%f1113; neg.f32 %f1115, %f1104; fma.rn.f32 %f1116, %f1115, %f1101, %f1114; mul.rn.f32 %f1117, %f1087, %f1116; add.f32 %f1118, %f1112, %f1104; sub.f32 %f1119, %f1104, %f1118; add.f32 %f1120, %f1112, %f1119; add.f32 %f1121, %f1117, %f1120; add.f32 %f1122, %f1118, %f1121; sub.f32 %f1123, %f1118, %f1122; add.f32 %f1124, %f1121, %f1123; mul.rn.f32 %f1126, %f1099, %f5411; mul.rn.f32 %f1128, %f1099, %f5412; add.f32 %f1129, %f1126, %f1122; sub.f32 %f1130, %f1126, %f1129; add.f32 %f1131, %f1122, %f1130; add.f32 %f1132, %f1124, %f1131; add.f32 %f1133, %f1128, %f1132; add.f32 %f1134, %f1129, %f1133; sub.f32 %f1135, %f1129, %f1134; add.f32 %f1136, %f1133, %f1135; mul.rn.f32 %f1137, %f1007, %f1134; neg.f32 %f1138, %f1137; fma.rn.f32 %f1139, %f1007, %f1134, %f1138; fma.rn.f32 %f1140, %f1007, %f1136, %f1139; fma.rn.f32 %f1142, %f1058, %f1134, %f1140; add.rn.f32 %f1143, %f1137, %f1142; neg.f32 %f1144, %f1143; add.rn.f32 %f1145, %f1137, %f1144; add.rn.f32 %f1146, %f1145, %f1142; mov.b32 %r367, %f1143; setp.eq.s32 %p66, %r367, 1118925336; add.s32 %r368, %r367, -1; mov.b32 %f1147, %r368; add.f32 %f1148, %f1146, 0f37000000; selp.f32 %f35, %f1148, %f1146, %p66; selp.f32 %f1149, %f1147, %f1143, %p66; mul.rn.f32 %f1151, %f1149, %f5413; cvt.rzi.f32.f32 %f1152, %f1151; abs.f32 %f1153, %f1152; setp.gt.f32 %p67, %f1153, 0f42FC0000; mov.b32 %r369, %f1152; and.b32 %r370, %r369, -2147483648; or.b32 %r371, %r370, 1123811328; mov.b32 %f1154, %r371; selp.f32 %f1155, %f1154, %f1152, %p67; fma.rn.f32 %f1157, %f1155, %f5414, %f1149; fma.rn.f32 %f1159, %f1155, %f5415, %f1157; mul.f32 %f1160, %f1159, 0f3FB8AA3B; add.f32 %f1161, %f1155, 0f4B40007F; mov.b32 %r372, %f1161; shl.b32 %r373, %r372, 23; mov.b32 %f1162, %r373; ex2.approx.ftz.f32 %f1163, %f1160; mul.f32 %f36, %f1163, %f1162; setp.eq.f32 %p68, %f36, 0f7F800000; mov.f32 %f5462, 0f7F800000; @%p68 bra $L__BB0_46; fma.rn.f32 %f5462, %f36, %f35, %f36; $L__BB0_46: setp.lt.f32 %p69, %f33, 0f00000000; and.pred %p2, %p69, %p55; setp.eq.f32 %p71, %f33, 
0f00000000; @%p71 bra $L__BB0_50; bra.uni $L__BB0_47; $L__BB0_50: add.f32 %f1168, %f33, %f33; selp.f32 %f5464, %f1168, 0f00000000, %p55; bra.uni $L__BB0_51; $L__BB0_47: mov.b32 %r374, %f5462; xor.b32 %r375, %r374, -2147483648; mov.b32 %f1164, %r375; selp.f32 %f5464, %f1164, %f5462, %p2; setp.geu.f32 %p72, %f33, 0f00000000; @%p72 bra $L__BB0_51; cvt.rzi.f32.f32 %f1166, %f1007; setp.eq.f32 %p73, %f1166, 0f40000000; @%p73 bra $L__BB0_51; mov.f32 %f5464, 0f7FFFFFFF; $L__BB0_51: add.f32 %f1169, %f34, 0f40000000; mov.b32 %r376, %f1169; setp.lt.s32 %p75, %r376, 2139095040; @%p75 bra $L__BB0_56; setp.gtu.f32 %p76, %f34, 0f7F800000; @%p76 bra $L__BB0_55; bra.uni $L__BB0_53; $L__BB0_55: add.f32 %f5464, %f33, 0f40000000; bra.uni $L__BB0_56; $L__BB0_53: setp.neu.f32 %p77, %f34, 0f7F800000; @%p77 bra $L__BB0_56; selp.f32 %f5464, 0fFF800000, 0f7F800000, %p2; $L__BB0_56: mov.f32 %f5423, 0f3102E308; mov.f32 %f5422, 0fBF317218; mov.f32 %f5421, 0f3FB8AA3B; mov.f32 %f5420, 0f35BFBE8E; mov.f32 %f5419, 0f3F317200; mov.f32 %f5418, 0f3DAAAABD; mov.f32 %f5417, 0f3C4CAF63; mov.f32 %f5416, 0f3B18F0FE; mov.f32 %f1173, 0f3F400000; sub.f32 %f1174, %f1173, %f5464; setp.eq.f32 %p78, %f33, 0f3F800000; selp.f32 %f45, 0fBE800000, %f1174, %p78; add.f32 %f46, %f17, 0fBF000000; abs.f32 %f47, %f46; setp.lt.f32 %p79, %f47, 0f00800000; mul.f32 %f1175, %f47, 0f4B800000; selp.f32 %f1176, %f1175, %f47, %p79; selp.f32 %f1177, 0fC3170000, 0fC2FE0000, %p79; mov.b32 %r377, %f1176; and.b32 %r378, %r377, 8388607; or.b32 %r379, %r378, 1065353216; mov.b32 %f1178, %r379; shr.u32 %r380, %r377, 23; cvt.rn.f32.u32 %f1179, %r380; add.f32 %f1180, %f1177, %f1179; setp.gt.f32 %p80, %f1178, 0f3FB504F3; mul.f32 %f1181, %f1178, 0f3F000000; add.f32 %f1182, %f1180, 0f3F800000; selp.f32 %f1183, %f1182, %f1180, %p80; selp.f32 %f1184, %f1181, %f1178, %p80; add.f32 %f1185, %f1184, 0fBF800000; add.f32 %f1171, %f1184, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f1170,%f1171; // end inline asm add.f32 %f1186, %f1185, %f1185; 
mul.f32 %f1188, %f1170, %f1186; mul.f32 %f1189, %f1188, %f1188; fma.rn.f32 %f1192, %f5416, %f1189, %f5417; fma.rn.f32 %f1194, %f1192, %f1189, %f5418; mul.rn.f32 %f1195, %f1194, %f1189; mul.rn.f32 %f1196, %f1195, %f1188; sub.f32 %f1197, %f1185, %f1188; add.f32 %f1198, %f1197, %f1197; neg.f32 %f1199, %f1188; fma.rn.f32 %f1200, %f1199, %f1185, %f1198; mul.rn.f32 %f1201, %f1170, %f1200; add.f32 %f1202, %f1196, %f1188; sub.f32 %f1203, %f1188, %f1202; add.f32 %f1204, %f1196, %f1203; add.f32 %f1205, %f1201, %f1204; add.f32 %f1206, %f1202, %f1205; sub.f32 %f1207, %f1202, %f1206; add.f32 %f1208, %f1205, %f1207; mul.rn.f32 %f1210, %f1183, %f5419; mul.rn.f32 %f1212, %f1183, %f5420; add.f32 %f1213, %f1210, %f1206; sub.f32 %f1214, %f1210, %f1213; add.f32 %f1215, %f1206, %f1214; add.f32 %f1216, %f1208, %f1215; add.f32 %f1217, %f1212, %f1216; add.f32 %f1218, %f1213, %f1217; sub.f32 %f1219, %f1213, %f1218; add.f32 %f1220, %f1217, %f1219; mul.rn.f32 %f1221, %f1007, %f1218; neg.f32 %f1222, %f1221; fma.rn.f32 %f1223, %f1007, %f1218, %f1222; fma.rn.f32 %f1224, %f1007, %f1220, %f1223; fma.rn.f32 %f1226, %f1058, %f1218, %f1224; add.rn.f32 %f1227, %f1221, %f1226; neg.f32 %f1228, %f1227; add.rn.f32 %f1229, %f1221, %f1228; add.rn.f32 %f1230, %f1229, %f1226; mov.b32 %r381, %f1227; setp.eq.s32 %p81, %r381, 1118925336; add.s32 %r382, %r381, -1; mov.b32 %f1231, %r382; add.f32 %f1232, %f1230, 0f37000000; selp.f32 %f48, %f1232, %f1230, %p81; selp.f32 %f1233, %f1231, %f1227, %p81; mul.rn.f32 %f1235, %f1233, %f5421; cvt.rzi.f32.f32 %f1236, %f1235; abs.f32 %f1237, %f1236; setp.gt.f32 %p82, %f1237, 0f42FC0000; mov.b32 %r383, %f1236; and.b32 %r384, %r383, -2147483648; or.b32 %r385, %r384, 1123811328; mov.b32 %f1238, %r385; selp.f32 %f1239, %f1238, %f1236, %p82; fma.rn.f32 %f1241, %f1239, %f5422, %f1233; fma.rn.f32 %f1243, %f1239, %f5423, %f1241; mul.f32 %f1244, %f1243, 0f3FB8AA3B; add.f32 %f1245, %f1239, 0f4B40007F; mov.b32 %r386, %f1245; shl.b32 %r387, %r386, 23; mov.b32 %f1246, %r387; 
ex2.approx.ftz.f32 %f1247, %f1244; mul.f32 %f49, %f1247, %f1246; setp.eq.f32 %p83, %f49, 0f7F800000; mov.f32 %f5465, 0f7F800000; @%p83 bra $L__BB0_58; fma.rn.f32 %f5465, %f49, %f48, %f49; $L__BB0_58: setp.lt.f32 %p84, %f46, 0f00000000; and.pred %p3, %p84, %p55; setp.eq.f32 %p86, %f46, 0f00000000; @%p86 bra $L__BB0_62; bra.uni $L__BB0_59; $L__BB0_62: add.f32 %f1252, %f46, %f46; selp.f32 %f5467, %f1252, 0f00000000, %p55; bra.uni $L__BB0_63; $L__BB0_59: mov.b32 %r388, %f5465; xor.b32 %r389, %r388, -2147483648; mov.b32 %f1248, %r389; selp.f32 %f5467, %f1248, %f5465, %p3; setp.geu.f32 %p87, %f46, 0f00000000; @%p87 bra $L__BB0_63; cvt.rzi.f32.f32 %f1250, %f1007; setp.eq.f32 %p88, %f1250, 0f40000000; @%p88 bra $L__BB0_63; mov.f32 %f5467, 0f7FFFFFFF; $L__BB0_63: add.f32 %f1253, %f47, 0f40000000; mov.b32 %r390, %f1253; setp.lt.s32 %p90, %r390, 2139095040; @%p90 bra $L__BB0_68; setp.gtu.f32 %p91, %f47, 0f7F800000; @%p91 bra $L__BB0_67; bra.uni $L__BB0_65; $L__BB0_67: add.f32 %f5467, %f46, 0f40000000; bra.uni $L__BB0_68; $L__BB0_65: setp.neu.f32 %p92, %f47, 0f7F800000; @%p92 bra $L__BB0_68; selp.f32 %f5467, 0fFF800000, 0f7F800000, %p3; $L__BB0_68: mov.f32 %f5431, 0f3102E308; mov.f32 %f5430, 0fBF317218; mov.f32 %f5429, 0f3FB8AA3B; mov.f32 %f5428, 0f35BFBE8E; mov.f32 %f5427, 0f3F317200; mov.f32 %f5426, 0f3DAAAABD; mov.f32 %f5425, 0f3C4CAF63; mov.f32 %f5424, 0f3B18F0FE; mov.f32 %f5182, 0f3FC00000; mul.f32 %f1257, %f5467, 0f3F000000; setp.eq.f32 %p93, %f46, 0f3F800000; selp.f32 %f58, 0f3F000000, %f1257, %p93; neg.f32 %f1258, %f16; div.rn.f32 %f59, %f1258, %f982; sub.f32 %f60, %f5182, %f59; abs.f32 %f61, %f60; setp.lt.f32 %p94, %f61, 0f00800000; mul.f32 %f1260, %f61, 0f4B800000; selp.f32 %f1261, %f1260, %f61, %p94; selp.f32 %f1262, 0fC3170000, 0fC2FE0000, %p94; mov.b32 %r391, %f1261; and.b32 %r392, %r391, 8388607; or.b32 %r393, %r392, 1065353216; mov.b32 %f1263, %r393; shr.u32 %r394, %r391, 23; cvt.rn.f32.u32 %f1264, %r394; add.f32 %f1265, %f1262, %f1264; setp.gt.f32 %p95, %f1263, 
0f3FB504F3; mul.f32 %f1266, %f1263, 0f3F000000; add.f32 %f1267, %f1265, 0f3F800000; selp.f32 %f1268, %f1267, %f1265, %p95; selp.f32 %f1269, %f1266, %f1263, %p95; add.f32 %f1270, %f1269, 0fBF800000; add.f32 %f1255, %f1269, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f1254,%f1255; // end inline asm add.f32 %f1271, %f1270, %f1270; mul.f32 %f1273, %f1254, %f1271; mul.f32 %f1274, %f1273, %f1273; fma.rn.f32 %f1277, %f5424, %f1274, %f5425; fma.rn.f32 %f1279, %f1277, %f1274, %f5426; mul.rn.f32 %f1280, %f1279, %f1274; mul.rn.f32 %f1281, %f1280, %f1273; sub.f32 %f1282, %f1270, %f1273; add.f32 %f1283, %f1282, %f1282; neg.f32 %f1284, %f1273; fma.rn.f32 %f1285, %f1284, %f1270, %f1283; mul.rn.f32 %f1286, %f1254, %f1285; add.f32 %f1287, %f1281, %f1273; sub.f32 %f1288, %f1273, %f1287; add.f32 %f1289, %f1281, %f1288; add.f32 %f1290, %f1286, %f1289; add.f32 %f1291, %f1287, %f1290; sub.f32 %f1292, %f1287, %f1291; add.f32 %f1293, %f1290, %f1292; mul.rn.f32 %f1295, %f1268, %f5427; mul.rn.f32 %f1297, %f1268, %f5428; add.f32 %f1298, %f1295, %f1291; sub.f32 %f1299, %f1295, %f1298; add.f32 %f1300, %f1291, %f1299; add.f32 %f1301, %f1293, %f1300; add.f32 %f1302, %f1297, %f1301; add.f32 %f1303, %f1298, %f1302; sub.f32 %f1304, %f1298, %f1303; add.f32 %f1305, %f1302, %f1304; mul.rn.f32 %f1306, %f1007, %f1303; neg.f32 %f1307, %f1306; fma.rn.f32 %f1308, %f1007, %f1303, %f1307; fma.rn.f32 %f1309, %f1007, %f1305, %f1308; fma.rn.f32 %f1311, %f1058, %f1303, %f1309; add.rn.f32 %f1312, %f1306, %f1311; neg.f32 %f1313, %f1312; add.rn.f32 %f1314, %f1306, %f1313; add.rn.f32 %f1315, %f1314, %f1311; mov.b32 %r395, %f1312; setp.eq.s32 %p96, %r395, 1118925336; add.s32 %r396, %r395, -1; mov.b32 %f1316, %r396; add.f32 %f1317, %f1315, 0f37000000; selp.f32 %f62, %f1317, %f1315, %p96; selp.f32 %f1318, %f1316, %f1312, %p96; mul.rn.f32 %f1320, %f1318, %f5429; cvt.rzi.f32.f32 %f1321, %f1320; abs.f32 %f1322, %f1321; setp.gt.f32 %p97, %f1322, 0f42FC0000; mov.b32 %r397, %f1321; and.b32 %r398, %r397, -2147483648; 
or.b32 %r399, %r398, 1123811328; mov.b32 %f1323, %r399; selp.f32 %f1324, %f1323, %f1321, %p97; fma.rn.f32 %f1326, %f1324, %f5430, %f1318; fma.rn.f32 %f1328, %f1324, %f5431, %f1326; mul.f32 %f1329, %f1328, 0f3FB8AA3B; add.f32 %f1330, %f1324, 0f4B40007F; mov.b32 %r400, %f1330; shl.b32 %r401, %r400, 23; mov.b32 %f1331, %r401; ex2.approx.ftz.f32 %f1332, %f1329; mul.f32 %f63, %f1332, %f1331; setp.eq.f32 %p98, %f63, 0f7F800000; mov.f32 %f5468, 0f7F800000; @%p98 bra $L__BB0_70; fma.rn.f32 %f5468, %f63, %f62, %f63; $L__BB0_70: setp.lt.f32 %p99, %f60, 0f00000000; and.pred %p4, %p99, %p55; setp.eq.f32 %p101, %f60, 0f00000000; @%p101 bra $L__BB0_74; bra.uni $L__BB0_71; $L__BB0_74: add.f32 %f1337, %f60, %f60; selp.f32 %f5470, %f1337, 0f00000000, %p55; bra.uni $L__BB0_75; $L__BB0_71: mov.b32 %r402, %f5468; xor.b32 %r403, %r402, -2147483648; mov.b32 %f1333, %r403; selp.f32 %f5470, %f1333, %f5468, %p4; setp.geu.f32 %p102, %f60, 0f00000000; @%p102 bra $L__BB0_75; cvt.rzi.f32.f32 %f1335, %f1007; setp.eq.f32 %p103, %f1335, 0f40000000; @%p103 bra $L__BB0_75; mov.f32 %f5470, 0f7FFFFFFF; $L__BB0_75: add.f32 %f1338, %f61, 0f40000000; mov.b32 %r404, %f1338; setp.lt.s32 %p105, %r404, 2139095040; @%p105 bra $L__BB0_80; setp.gtu.f32 %p106, %f61, 0f7F800000; @%p106 bra $L__BB0_79; bra.uni $L__BB0_77; $L__BB0_79: add.f32 %f5470, %f60, 0f40000000; bra.uni $L__BB0_80; $L__BB0_77: setp.neu.f32 %p107, %f61, 0f7F800000; @%p107 bra $L__BB0_80; selp.f32 %f5470, 0fFF800000, 0f7F800000, %p4; $L__BB0_80: mov.f32 %f5384, 0f3102E308; mov.f32 %f5383, 0fBF317218; mov.f32 %f5382, 0f3FB8AA3B; mov.f32 %f5381, 0f35BFBE8E; mov.f32 %f5380, 0f3F317200; mov.f32 %f5379, 0f3DAAAABD; mov.f32 %f5378, 0f3C4CAF63; mov.f32 %f5377, 0f3B18F0FE; mul.f32 %f1342, %f5470, 0f3F000000; setp.eq.f32 %p108, %f60, 0f3F800000; selp.f32 %f72, 0f3F000000, %f1342, %p108; add.f32 %f73, %f59, 0fBF800000; abs.f32 %f74, %f73; setp.lt.f32 %p109, %f74, 0f00800000; mul.f32 %f1343, %f74, 0f4B800000; selp.f32 %f1344, %f1343, %f74, %p109; 
selp.f32 %f1345, 0fC3170000, 0fC2FE0000, %p109; mov.b32 %r405, %f1344; and.b32 %r406, %r405, 8388607; or.b32 %r407, %r406, 1065353216; mov.b32 %f1346, %r407; shr.u32 %r408, %r405, 23; cvt.rn.f32.u32 %f1347, %r408; add.f32 %f1348, %f1345, %f1347; setp.gt.f32 %p110, %f1346, 0f3FB504F3; mul.f32 %f1349, %f1346, 0f3F000000; add.f32 %f1350, %f1348, 0f3F800000; selp.f32 %f1351, %f1350, %f1348, %p110; selp.f32 %f1352, %f1349, %f1346, %p110; add.f32 %f1353, %f1352, 0fBF800000; add.f32 %f1340, %f1352, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f1339,%f1340; // end inline asm add.f32 %f1354, %f1353, %f1353; mul.f32 %f1356, %f1339, %f1354; mul.f32 %f1357, %f1356, %f1356; fma.rn.f32 %f1360, %f5377, %f1357, %f5378; fma.rn.f32 %f1362, %f1360, %f1357, %f5379; mul.rn.f32 %f1363, %f1362, %f1357; mul.rn.f32 %f1364, %f1363, %f1356; sub.f32 %f1365, %f1353, %f1356; add.f32 %f1366, %f1365, %f1365; neg.f32 %f1367, %f1356; fma.rn.f32 %f1368, %f1367, %f1353, %f1366; mul.rn.f32 %f1369, %f1339, %f1368; add.f32 %f1370, %f1364, %f1356; sub.f32 %f1371, %f1356, %f1370; add.f32 %f1372, %f1364, %f1371; add.f32 %f1373, %f1369, %f1372; add.f32 %f1374, %f1370, %f1373; sub.f32 %f1375, %f1370, %f1374; add.f32 %f1376, %f1373, %f1375; mul.rn.f32 %f1378, %f1351, %f5380; mul.rn.f32 %f1380, %f1351, %f5381; add.f32 %f1381, %f1378, %f1374; sub.f32 %f1382, %f1378, %f1381; add.f32 %f1383, %f1374, %f1382; add.f32 %f1384, %f1376, %f1383; add.f32 %f1385, %f1380, %f1384; add.f32 %f1386, %f1381, %f1385; sub.f32 %f1387, %f1381, %f1386; add.f32 %f1388, %f1385, %f1387; mul.rn.f32 %f1389, %f1007, %f1386; neg.f32 %f1390, %f1389; fma.rn.f32 %f1391, %f1007, %f1386, %f1390; fma.rn.f32 %f1392, %f1007, %f1388, %f1391; fma.rn.f32 %f1394, %f1058, %f1386, %f1392; add.rn.f32 %f1395, %f1389, %f1394; neg.f32 %f1396, %f1395; add.rn.f32 %f1397, %f1389, %f1396; add.rn.f32 %f1398, %f1397, %f1394; mov.b32 %r409, %f1395; setp.eq.s32 %p111, %r409, 1118925336; add.s32 %r410, %r409, -1; mov.b32 %f1399, %r410; add.f32 %f1400, %f1398, 
0f37000000; selp.f32 %f75, %f1400, %f1398, %p111; selp.f32 %f1401, %f1399, %f1395, %p111; mul.rn.f32 %f1403, %f1401, %f5382; cvt.rzi.f32.f32 %f1404, %f1403; abs.f32 %f1405, %f1404; setp.gt.f32 %p112, %f1405, 0f42FC0000; mov.b32 %r411, %f1404; and.b32 %r412, %r411, -2147483648; or.b32 %r413, %r412, 1123811328; mov.b32 %f1406, %r413; selp.f32 %f1407, %f1406, %f1404, %p112; fma.rn.f32 %f1409, %f1407, %f5383, %f1401; fma.rn.f32 %f1411, %f1407, %f5384, %f1409; mul.f32 %f1412, %f1411, 0f3FB8AA3B; add.f32 %f1413, %f1407, 0f4B40007F; mov.b32 %r414, %f1413; shl.b32 %r415, %r414, 23; mov.b32 %f1414, %r415; ex2.approx.ftz.f32 %f1415, %f1412; mul.f32 %f76, %f1415, %f1414; setp.eq.f32 %p113, %f76, 0f7F800000; mov.f32 %f5471, 0f7F800000; @%p113 bra $L__BB0_82; fma.rn.f32 %f5471, %f76, %f75, %f76; $L__BB0_82: setp.lt.f32 %p114, %f73, 0f00000000; and.pred %p5, %p114, %p55; setp.eq.f32 %p116, %f73, 0f00000000; @%p116 bra $L__BB0_86; bra.uni $L__BB0_83; $L__BB0_86: add.f32 %f1420, %f73, %f73; selp.f32 %f5473, %f1420, 0f00000000, %p55; bra.uni $L__BB0_87; $L__BB0_83: mov.b32 %r416, %f5471; xor.b32 %r417, %r416, -2147483648; mov.b32 %f1416, %r417; selp.f32 %f5473, %f1416, %f5471, %p5; setp.geu.f32 %p117, %f73, 0f00000000; @%p117 bra $L__BB0_87; cvt.rzi.f32.f32 %f1418, %f1007; setp.eq.f32 %p118, %f1418, 0f40000000; @%p118 bra $L__BB0_87; mov.f32 %f5473, 0f7FFFFFFF; $L__BB0_87: add.f32 %f1421, %f74, 0f40000000; mov.b32 %r418, %f1421; setp.lt.s32 %p120, %r418, 2139095040; @%p120 bra $L__BB0_92; setp.gtu.f32 %p121, %f74, 0f7F800000; @%p121 bra $L__BB0_91; bra.uni $L__BB0_89; $L__BB0_91: add.f32 %f5473, %f73, 0f40000000; bra.uni $L__BB0_92; $L__BB0_89: setp.neu.f32 %p122, %f74, 0f7F800000; @%p122 bra $L__BB0_92; selp.f32 %f5473, 0fFF800000, 0f7F800000, %p5; $L__BB0_92: mov.f32 %f5392, 0f3102E308; mov.f32 %f5391, 0fBF317218; mov.f32 %f5390, 0f3FB8AA3B; mov.f32 %f5389, 0f35BFBE8E; mov.f32 %f5388, 0f3F317200; mov.f32 %f5387, 0f3DAAAABD; mov.f32 %f5386, 0f3C4CAF63; mov.f32 %f5385, 0f3B18F0FE; 
mov.f32 %f5200, 0f3F400000; sub.f32 %f1426, %f5200, %f5473; setp.eq.f32 %p123, %f73, 0f3F800000; selp.f32 %f85, 0fBE800000, %f1426, %p123; add.f32 %f86, %f59, 0fBF000000; abs.f32 %f87, %f86; setp.lt.f32 %p124, %f87, 0f00800000; mul.f32 %f1427, %f87, 0f4B800000; selp.f32 %f1428, %f1427, %f87, %p124; selp.f32 %f1429, 0fC3170000, 0fC2FE0000, %p124; mov.b32 %r419, %f1428; and.b32 %r420, %r419, 8388607; or.b32 %r421, %r420, 1065353216; mov.b32 %f1430, %r421; shr.u32 %r422, %r419, 23; cvt.rn.f32.u32 %f1431, %r422; add.f32 %f1432, %f1429, %f1431; setp.gt.f32 %p125, %f1430, 0f3FB504F3; mul.f32 %f1433, %f1430, 0f3F000000; add.f32 %f1434, %f1432, 0f3F800000; selp.f32 %f1435, %f1434, %f1432, %p125; selp.f32 %f1436, %f1433, %f1430, %p125; add.f32 %f1437, %f1436, 0fBF800000; add.f32 %f1423, %f1436, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f1422,%f1423; // end inline asm add.f32 %f1438, %f1437, %f1437; mul.f32 %f1440, %f1422, %f1438; mul.f32 %f1441, %f1440, %f1440; fma.rn.f32 %f1444, %f5385, %f1441, %f5386; fma.rn.f32 %f1446, %f1444, %f1441, %f5387; mul.rn.f32 %f1447, %f1446, %f1441; mul.rn.f32 %f1448, %f1447, %f1440; sub.f32 %f1449, %f1437, %f1440; add.f32 %f1450, %f1449, %f1449; neg.f32 %f1451, %f1440; fma.rn.f32 %f1452, %f1451, %f1437, %f1450; mul.rn.f32 %f1453, %f1422, %f1452; add.f32 %f1454, %f1448, %f1440; sub.f32 %f1455, %f1440, %f1454; add.f32 %f1456, %f1448, %f1455; add.f32 %f1457, %f1453, %f1456; add.f32 %f1458, %f1454, %f1457; sub.f32 %f1459, %f1454, %f1458; add.f32 %f1460, %f1457, %f1459; mul.rn.f32 %f1462, %f1435, %f5388; mul.rn.f32 %f1464, %f1435, %f5389; add.f32 %f1465, %f1462, %f1458; sub.f32 %f1466, %f1462, %f1465; add.f32 %f1467, %f1458, %f1466; add.f32 %f1468, %f1460, %f1467; add.f32 %f1469, %f1464, %f1468; add.f32 %f1470, %f1465, %f1469; sub.f32 %f1471, %f1465, %f1470; add.f32 %f1472, %f1469, %f1471; mul.rn.f32 %f1473, %f1007, %f1470; neg.f32 %f1474, %f1473; fma.rn.f32 %f1475, %f1007, %f1470, %f1474; fma.rn.f32 %f1476, %f1007, %f1472, %f1475; 
fma.rn.f32 %f1478, %f1058, %f1470, %f1476; add.rn.f32 %f1479, %f1473, %f1478; neg.f32 %f1480, %f1479; add.rn.f32 %f1481, %f1473, %f1480; add.rn.f32 %f1482, %f1481, %f1478; mov.b32 %r423, %f1479; setp.eq.s32 %p126, %r423, 1118925336; add.s32 %r424, %r423, -1; mov.b32 %f1483, %r424; add.f32 %f1484, %f1482, 0f37000000; selp.f32 %f88, %f1484, %f1482, %p126; selp.f32 %f1485, %f1483, %f1479, %p126; mul.rn.f32 %f1487, %f1485, %f5390; cvt.rzi.f32.f32 %f1488, %f1487; abs.f32 %f1489, %f1488; setp.gt.f32 %p127, %f1489, 0f42FC0000; mov.b32 %r425, %f1488; and.b32 %r426, %r425, -2147483648; or.b32 %r427, %r426, 1123811328; mov.b32 %f1490, %r427; selp.f32 %f1491, %f1490, %f1488, %p127; fma.rn.f32 %f1493, %f1491, %f5391, %f1485; fma.rn.f32 %f1495, %f1491, %f5392, %f1493; mul.f32 %f1496, %f1495, 0f3FB8AA3B; add.f32 %f1497, %f1491, 0f4B40007F; mov.b32 %r428, %f1497; shl.b32 %r429, %r428, 23; mov.b32 %f1498, %r429; ex2.approx.ftz.f32 %f1499, %f1496; mul.f32 %f89, %f1499, %f1498; setp.eq.f32 %p128, %f89, 0f7F800000; mov.f32 %f5474, 0f7F800000; @%p128 bra $L__BB0_94; fma.rn.f32 %f5474, %f89, %f88, %f89; $L__BB0_94: setp.lt.f32 %p129, %f86, 0f00000000; and.pred %p6, %p129, %p55; setp.eq.f32 %p131, %f86, 0f00000000; @%p131 bra $L__BB0_98; bra.uni $L__BB0_95; $L__BB0_98: add.f32 %f1504, %f86, %f86; selp.f32 %f5476, %f1504, 0f00000000, %p55; bra.uni $L__BB0_99; $L__BB0_95: mov.b32 %r430, %f5474; xor.b32 %r431, %r430, -2147483648; mov.b32 %f1500, %r431; selp.f32 %f5476, %f1500, %f5474, %p6; setp.geu.f32 %p132, %f86, 0f00000000; @%p132 bra $L__BB0_99; cvt.rzi.f32.f32 %f1502, %f1007; setp.eq.f32 %p133, %f1502, 0f40000000; @%p133 bra $L__BB0_99; mov.f32 %f5476, 0f7FFFFFFF; $L__BB0_99: add.f32 %f1505, %f87, 0f40000000; mov.b32 %r432, %f1505; setp.lt.s32 %p135, %r432, 2139095040; @%p135 bra $L__BB0_104; setp.gtu.f32 %p136, %f87, 0f7F800000; @%p136 bra $L__BB0_103; bra.uni $L__BB0_101; $L__BB0_103: add.f32 %f5476, %f86, 0f40000000; bra.uni $L__BB0_104; $L__BB0_101: setp.neu.f32 %p137, %f87, 
0f7F800000; @%p137 bra $L__BB0_104; selp.f32 %f5476, 0fFF800000, 0f7F800000, %p6; $L__BB0_104: add.f32 %f5185, %f13, 0fBF800000; mul.f32 %f5184, %f982, %f5185; sub.f32 %f5183, %f5184, %f2; add.u64 %rd1265, %SPL, 96; ld.param.u64 %rd1264, [g2p2g_param_9]; add.u64 %rd1262, %SPL, 64; mul.f32 %f1506, %f5476, 0f3F000000; setp.eq.f32 %p138, %f86, 0f3F800000; selp.f32 %f1507, 0f3F000000, %f1506, %p138; max.f32 %f1508, %f13, 0fCF000000; cvt.rzi.s32.f32 %r433, %f1508; add.s32 %r434, %r433, -2; setp.gt.f32 %p139, %f13, 0f4EFFFFFF; selp.b32 %r435, 2147483645, %r434, %p139; setp.num.f32 %p140, %f13, %f13; selp.b32 %r436, %r435, -2, %p140; cvt.rn.f32.s32 %f1509, %r436; mul.f32 %f1510, %f1509, 0f3E800000; cvt.rmi.f32.f32 %f1511, %f1510; setp.gt.f32 %p141, %f1511, 0f4EFFFFFF; max.f32 %f1512, %f1511, 0fCF000000; cvt.rzi.s32.f32 %r437, %f1512; setp.num.f32 %p142, %f1511, %f1511; shl.b32 %r438, %r437, 2; selp.b32 %r439, -4, %r438, %p141; selp.b32 %r440, %r439, 0, %p142; sub.s32 %r441, %r436, %r440; max.f32 %f1513, %f14, 0fCF000000; cvt.rzi.s32.f32 %r442, %f1513; add.s32 %r443, %r442, -2; setp.gt.f32 %p143, %f14, 0f4EFFFFFF; selp.b32 %r444, 2147483645, %r443, %p143; setp.num.f32 %p144, %f14, %f14; selp.b32 %r445, %r444, -2, %p144; cvt.rn.f32.s32 %f1514, %r445; mul.f32 %f1515, %f1514, 0f3E800000; cvt.rmi.f32.f32 %f1516, %f1515; setp.gt.f32 %p145, %f1516, 0f4EFFFFFF; max.f32 %f1517, %f1516, 0fCF000000; cvt.rzi.s32.f32 %r446, %f1517; setp.num.f32 %p146, %f1516, %f1516; shl.b32 %r447, %r446, 2; selp.b32 %r448, -4, %r447, %p145; selp.b32 %r449, %r448, 0, %p146; sub.s32 %r450, %r445, %r449; cvt.u64.u32 %rd528, %r450; cvt.u64.u32 %rd529, %r441; bfi.b64 %rd530, %rd528, %rd529, 32, 32; mul.wide.u32 %rd531, %r450, 8; and.b64 %rd532, %rd531, 4294967288; add.s64 %rd533, %rd530, %rd532; add.s64 %rd66, %rd533, 9; add.f32 %f98, %f982, %f982; add.f32 %f1518, %f5183, %f98; add.f32 %f1519, %f16, %f98; mul.f32 %f1520, %f58, %f1507; shl.b64 %rd534, %rd66, 6; and.b64 %rd535, %rd534, 274877906880; mov.u64 
%rd536, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; add.s64 %rd537, %rd536, %rd535; ld.shared.u64 %rd538, [%rd537+1184]; cvt.u32.u64 %r451, %rd538; mov.b32 %f1521, %r451; shr.u64 %rd539, %rd538, 32; cvt.u32.u64 %r452, %rd539; mov.b32 %f1522, %r452; fma.rn.f32 %f1523, %f1520, %f1521, 0f00000000; fma.rn.f32 %f1524, %f1520, %f1522, 0f00000000; mul.f32 %f1525, %f12, %f1520; mul.f32 %f1526, %f1525, %f1521; mul.f32 %f1527, %f1525, %f1522; fma.rn.f32 %f1528, %f1518, %f1526, 0f00000000; fma.rn.f32 %f1529, %f1518, %f1527, 0f00000000; fma.rn.f32 %f1530, %f1519, %f1526, 0f00000000; fma.rn.f32 %f1531, %f1519, %f1527, 0f00000000; ld.shared.f32 %f1532, [%rd537+1188]; mul.f32 %f1533, %f1519, %f1532; fma.rn.f32 %f1534, %f1518, %f1521, %f1533; mul.f32 %f1535, %f1520, %f1534; fma.rn.f32 %f1536, %f12, %f1535, 0f00000000; mul.f32 %f99, %f982, 0f00000000; add.f32 %f1537, %f16, %f99; ld.shared.u64 %rd540, [%rd537+160]; cvt.u32.u64 %r453, %rd540; mov.b32 %f1538, %r453; mul.f32 %f1539, %f58, %f72; shr.u64 %rd541, %rd540, 32; cvt.u32.u64 %r454, %rd541; mov.b32 %f1540, %r454; fma.rn.f32 %f1541, %f1539, %f1538, %f1523; fma.rn.f32 %f1542, %f1539, %f1540, %f1524; mul.f32 %f1543, %f12, %f1539; mul.f32 %f1544, %f1543, %f1538; mul.f32 %f1545, %f1543, %f1540; fma.rn.f32 %f1546, %f1518, %f1544, %f1528; fma.rn.f32 %f1547, %f1518, %f1545, %f1529; fma.rn.f32 %f1548, %f1537, %f1544, %f1530; fma.rn.f32 %f1549, %f1537, %f1545, %f1531; ld.shared.f32 %f1550, [%rd537+164]; mul.f32 %f1551, %f1537, %f1550; fma.rn.f32 %f1552, %f1518, %f1538, %f1551; mul.f32 %f1553, %f1539, %f1552; fma.rn.f32 %f1554, %f12, %f1553, %f1536; ld.shared.u64 %rd542, [%rd537+672]; cvt.u32.u64 %r455, %rd542; mov.b32 %f1555, %r455; mul.f32 %f1556, %f58, %f85; shr.u64 %rd543, %rd542, 32; cvt.u32.u64 %r456, %rd543; mov.b32 %f1557, %r456; fma.rn.f32 %f1558, %f1556, %f1555, %f1541; fma.rn.f32 %f1559, %f1556, %f1557, %f1542; mul.f32 %f1560, %f12, %f1556; mul.f32 %f1561, %f1560, %f1555; mul.f32 
%f1562, %f1560, %f1557; add.f32 %f1563, %f16, %f982; fma.rn.f32 %f1564, %f1518, %f1561, %f1546; fma.rn.f32 %f1565, %f1518, %f1562, %f1547; fma.rn.f32 %f1566, %f1563, %f1561, %f1548; fma.rn.f32 %f1567, %f1563, %f1562, %f1549; ld.shared.f32 %f1568, [%rd537+676]; mul.f32 %f1569, %f1563, %f1568; fma.rn.f32 %f1570, %f1518, %f1555, %f1569; mul.f32 %f1571, %f1556, %f1570; fma.rn.f32 %f1572, %f12, %f1571, %f1554; add.f32 %f1573, %f5183, %f99; mul.f32 %f1574, %f32, %f1507; ld.shared.u64 %rd544, [%rd537+1056]; cvt.u32.u64 %r457, %rd544; mov.b32 %f1575, %r457; shr.u64 %rd545, %rd544, 32; cvt.u32.u64 %r458, %rd545; mov.b32 %f1576, %r458; fma.rn.f32 %f1577, %f1574, %f1575, %f1558; fma.rn.f32 %f1578, %f1574, %f1576, %f1559; mul.f32 %f1579, %f12, %f1574; mul.f32 %f1580, %f1579, %f1575; mul.f32 %f1581, %f1579, %f1576; fma.rn.f32 %f1582, %f1573, %f1580, %f1564; fma.rn.f32 %f1583, %f1573, %f1581, %f1565; fma.rn.f32 %f1584, %f1519, %f1580, %f1566; fma.rn.f32 %f1585, %f1519, %f1581, %f1567; ld.shared.f32 %f1586, [%rd537+1060]; mul.f32 %f1587, %f1519, %f1586; fma.rn.f32 %f1588, %f1573, %f1575, %f1587; mul.f32 %f1589, %f1574, %f1588; fma.rn.f32 %f1590, %f12, %f1589, %f1572; ld.shared.u64 %rd546, [%rd537+32]; cvt.u32.u64 %r459, %rd546; mov.b32 %f1591, %r459; mul.f32 %f1592, %f32, %f72; shr.u64 %rd547, %rd546, 32; cvt.u32.u64 %r460, %rd547; mov.b32 %f1593, %r460; fma.rn.f32 %f1594, %f1592, %f1591, %f1577; fma.rn.f32 %f1595, %f1592, %f1593, %f1578; mul.f32 %f1596, %f12, %f1592; mul.f32 %f1597, %f1596, %f1591; mul.f32 %f1598, %f1596, %f1593; fma.rn.f32 %f1599, %f1573, %f1597, %f1582; fma.rn.f32 %f1600, %f1573, %f1598, %f1583; fma.rn.f32 %f1601, %f1537, %f1597, %f1584; fma.rn.f32 %f1602, %f1537, %f1598, %f1585; ld.shared.f32 %f1603, [%rd537+36]; mul.f32 %f1604, %f1537, %f1603; fma.rn.f32 %f1605, %f1573, %f1591, %f1604; mul.f32 %f1606, %f1592, %f1605; fma.rn.f32 %f1607, %f12, %f1606, %f1590; ld.shared.u64 %rd548, [%rd537+544]; cvt.u32.u64 %r461, %rd548; mov.b32 %f1608, %r461; mul.f32 %f1609, 
%f32, %f85; shr.u64 %rd549, %rd548, 32; cvt.u32.u64 %r462, %rd549; mov.b32 %f1610, %r462; fma.rn.f32 %f1611, %f1609, %f1608, %f1594; fma.rn.f32 %f1612, %f1609, %f1610, %f1595; mul.f32 %f1613, %f12, %f1609; mul.f32 %f1614, %f1613, %f1608; mul.f32 %f1615, %f1613, %f1610; fma.rn.f32 %f1616, %f1573, %f1614, %f1599; fma.rn.f32 %f1617, %f1573, %f1615, %f1600; fma.rn.f32 %f1618, %f1563, %f1614, %f1601; fma.rn.f32 %f1619, %f1563, %f1615, %f1602; ld.shared.f32 %f1620, [%rd537+548]; mul.f32 %f1621, %f1563, %f1620; fma.rn.f32 %f1622, %f1573, %f1608, %f1621; mul.f32 %f1623, %f1609, %f1622; fma.rn.f32 %f1624, %f12, %f1623, %f1607; mul.f32 %f1625, %f45, %f1507; ld.shared.u64 %rd550, [%rd537+1120]; cvt.u32.u64 %r463, %rd550; mov.b32 %f1626, %r463; shr.u64 %rd551, %rd550, 32; cvt.u32.u64 %r464, %rd551; mov.b32 %f1627, %r464; fma.rn.f32 %f1628, %f1625, %f1626, %f1611; fma.rn.f32 %f1629, %f1625, %f1627, %f1612; mul.f32 %f1630, %f12, %f1625; mul.f32 %f1631, %f1630, %f1626; mul.f32 %f1632, %f1630, %f1627; add.f32 %f1633, %f5183, %f982; fma.rn.f32 %f1634, %f1633, %f1631, %f1616; fma.rn.f32 %f1635, %f1633, %f1632, %f1617; fma.rn.f32 %f1636, %f1519, %f1631, %f1618; fma.rn.f32 %f1637, %f1519, %f1632, %f1619; ld.shared.f32 %f1638, [%rd537+1124]; mul.f32 %f1639, %f1519, %f1638; fma.rn.f32 %f1640, %f1633, %f1626, %f1639; mul.f32 %f1641, %f1625, %f1640; fma.rn.f32 %f1642, %f12, %f1641, %f1624; ld.shared.u64 %rd552, [%rd537+96]; cvt.u32.u64 %r465, %rd552; mov.b32 %f1643, %r465; mul.f32 %f1644, %f45, %f72; shr.u64 %rd553, %rd552, 32; cvt.u32.u64 %r466, %rd553; mov.b32 %f1645, %r466; fma.rn.f32 %f1646, %f1644, %f1643, %f1628; fma.rn.f32 %f1647, %f1644, %f1645, %f1629; mul.f32 %f1648, %f12, %f1644; mul.f32 %f1649, %f1648, %f1643; mul.f32 %f1650, %f1648, %f1645; fma.rn.f32 %f1651, %f1633, %f1649, %f1634; fma.rn.f32 %f1652, %f1633, %f1650, %f1635; fma.rn.f32 %f1653, %f1537, %f1649, %f1636; fma.rn.f32 %f1654, %f1537, %f1650, %f1637; ld.shared.f32 %f1655, [%rd537+100]; mul.f32 %f1656, %f1537, %f1655; 
fma.rn.f32 %f1657, %f1633, %f1643, %f1656; mul.f32 %f1658, %f1644, %f1657; fma.rn.f32 %f1659, %f12, %f1658, %f1642; ld.shared.u64 %rd554, [%rd537+608]; cvt.u32.u64 %r467, %rd554; mov.b32 %f1660, %r467; mul.f32 %f1661, %f45, %f85; shr.u64 %rd555, %rd554, 32; cvt.u32.u64 %r468, %rd555; mov.b32 %f1662, %r468; fma.rn.f32 %f5478, %f1661, %f1660, %f1646; fma.rn.f32 %f5477, %f1661, %f1662, %f1647; mul.f32 %f1663, %f12, %f1661; mul.f32 %f1664, %f1663, %f1660; mul.f32 %f1665, %f1663, %f1662; fma.rn.f32 %f5533, %f1633, %f1664, %f1651; fma.rn.f32 %f5534, %f1633, %f1665, %f1652; fma.rn.f32 %f5535, %f1563, %f1664, %f1653; fma.rn.f32 %f5536, %f1563, %f1665, %f1654; ld.shared.f32 %f1666, [%rd537+612]; mul.f32 %f1667, %f1563, %f1666; fma.rn.f32 %f1668, %f1633, %f1660, %f1667; mul.f32 %f1669, %f1661, %f1668; fma.rn.f32 %f106, %f12, %f1669, %f1659; st.local.v4.f32 [%rd1262], {%f5533, %f5534, %f5535, %f5536}; cvta.to.global.u64 %rd558, %rd1264; mul.lo.s64 %rd559, %rd65, 96; add.s64 %rd67, %rd558, %rd559; mov.b32 %r469, %f5478; mov.b32 %r470, %f5477; st.local.v2.u32 [%rd1265], {%r469, %r470}; ld.global.u32 %r12, [%rd67]; setp.eq.s16 %p147, %rs9, 0; mov.b64 %rd1323, {%r469, %r470}; @%p147 bra $L__BB0_106; add.u64 %rd1267, %SPL, 96; st.local.u64 [%rd1267], %rd64; cvt.u32.u64 %r471, %rd64; mov.b32 %f5478, %r471; shr.u64 %rd564, %rd64, 32; cvt.u32.u64 %r472, %rd564; mov.b32 %f5477, %r472; mov.u64 %rd1323, %rd64; $L__BB0_106: add.u64 %rd1271, %SP, 96; add.u64 %rd1317, %SP, 96; cvta.to.local.u64 %rd1315, %rd1317; add.s64 %rd1321, %rd1315, 8; mov.u64 %rd1322, 2; mov.u64 %rd1316, %rd1315; mov.u64 %rd1318, %rd1315; mov.u64 %rd1319, %rd1315; mov.u64 %rd1320, %rd1317; $L__BB0_107: setp.eq.s64 %p148, %rd1322, 0; @%p148 bra $L__BB0_110; add.s64 %rd1322, %rd1322, -1; add.s64 %rd566, %rd1315, 8; setp.eq.s64 %p149, %rd1318, %rd1321; selp.b64 %rd1315, %rd566, %rd1315, %p149; add.s64 %rd567, %rd1316, 8; selp.b64 %rd1316, %rd567, %rd1316, %p149; add.s64 %rd568, %rd1317, 8; selp.b64 %rd1317, %rd568, 
%rd1317, %p149; selp.b64 %rd569, %rd566, %rd1318, %p149; selp.b64 %rd570, %rd567, %rd1319, %p149; selp.b64 %rd571, %rd568, %rd1320, %p149; add.s64 %rd572, %rd1318, 8; selp.b64 %rd1321, %rd572, %rd1321, %p149; setp.eq.s64 %p150, %rd1322, 0; add.s64 %rd573, %rd569, 4; add.s64 %rd574, %rd570, 4; add.s64 %rd575, %rd571, 4; selp.b64 %rd1318, %rd569, %rd573, %p150; selp.b64 %rd1319, %rd570, %rd574, %p150; selp.b64 %rd1320, %rd571, %rd575, %p150; ld.local.f32 %f1670, [%rd570]; abs.f32 %f1671, %f1670; mul.f32 %f1672, %f1671, %f980; setp.ltu.f32 %p151, %f1672, %f982; @%p151 bra $L__BB0_107; setp.nan.f32 %p152, %f5478, %f5478; mov.b32 %r473, %f5478; setp.lt.s32 %p153, %r473, 0; selp.f32 %f1673, 0fBF800000, 0f3F800000, %p153; selp.f32 %f1674, 0f7FC00000, %f1673, %p152; mul.f32 %f1675, %f982, %f1674; mov.b32 %r474, %f5477; setp.lt.s32 %p154, %r474, 0; selp.f32 %f1676, 0fBF800000, 0f3F800000, %p154; setp.nan.f32 %p155, %f5477, %f5477; selp.f32 %f1677, 0f7FC00000, %f1676, %p155; mul.f32 %f1678, %f982, %f1677; div.rn.f32 %f1679, %f1678, %f980; mov.b32 %r475, %f1679; div.rn.f32 %f1680, %f1675, %f980; mov.b32 %r476, %f1680; add.u64 %rd577, %SPL, 96; st.local.v2.f32 [%rd577], {%f1680, %f1679}; mov.b64 %rd1323, {%r476, %r475}; $L__BB0_110: cvt.u32.u64 %r477, %rd1323; mov.b32 %f1681, %r477; shr.u64 %rd578, %rd1323, 32; cvt.u32.u64 %r478, %rd578; mov.b32 %f1682, %r478; fma.rn.f32 %f115, %f1681, %f980, %f2; fma.rn.f32 %f116, %f1682, %f980, %f3; setp.eq.s32 %p156, %r12, 2; @%p156 bra $L__BB0_112; bra.uni $L__BB0_111; $L__BB0_112: mul.f32 %f1696, %f106, %f980; fma.rn.f32 %f5481, %f1696, %f7, %f7; mov.u64 %rd1324, %rd522; bra.uni $L__BB0_113; $L__BB0_111: mul.f32 %f1683, %f5533, %f980; mul.f32 %f1684, %f5534, %f980; mul.f32 %f1685, %f5535, %f980; mul.f32 %f1686, %f1685, %f5531; fma.rn.f32 %f1687, %f1683, %f7, %f1686; mul.f32 %f1688, %f5536, %f980; mul.f32 %f1689, %f1688, %f5531; fma.rn.f32 %f1690, %f1684, %f7, %f1689; mul.f32 %f1691, %f1685, %f10; fma.rn.f32 %f1692, %f1683, %f5530, %f1691; 
mul.f32 %f1693, %f1688, %f10; fma.rn.f32 %f1694, %f1684, %f5530, %f1693; add.f32 %f5531, %f5531, %f1690; add.f32 %f1695, %f7, %f1687; st.local.v2.f32 [%rd522], {%f1695, %f5531}; add.f32 %f5530, %f1692, %f5530; st.local.f32 [%rd522+8], %f5530; add.f32 %f5481, %f1694, %f10; add.s64 %rd1324, %rd522, 12; $L__BB0_113: st.local.f32 [%rd1324], %f5481; ld.global.u32 %r13, [%rd67+32]; setp.eq.s32 %p157, %r13, 5; @%p157 bra $L__BB0_341; bra.uni $L__BB0_114; $L__BB0_341: setp.eq.s16 %p463, %rs8, 0; @%p463 bra $L__BB0_343; mov.f32 %f5533, 0f00000000; add.u64 %rd1231, %SPL, 64; add.u64 %rd839, %SPL, 96; st.local.v2.f32 [%rd839], {%f5533, %f5533}; st.local.v4.f32 [%rd1231], {%f5533, %f5533, %f5533, %f5533}; mov.f32 %f5534, %f5533; mov.f32 %f5535, %f5533; mov.f32 %f5536, %f5533; $L__BB0_343: add.u64 %rd1292, %SPL, 80; ld.local.f32 %f5635, [%rd1292+12]; ld.local.f32 %f5546, [%rd1292]; mul.f32 %f3214, %f5546, %f5635; mul.f32 %f438, %f5530, %f5531; sub.f32 %f439, %f3214, %f438; div.rn.f32 %f440, %f4, %f5; div.rn.f32 %f3215, %f440, %f439; setp.eq.f32 %p464, %f3215, 0f00000000; setp.ne.s16 %p465, %rs7, 0; or.pred %p466, %p465, %p464; @%p466 bra $L__BB0_626; bra.uni $L__BB0_344; $L__BB0_626: add.u64 %rd1298, %SPL, 80; add.u64 %rd1071, %SPL, 32; mov.u64 %rd1072, 0; st.local.v2.u64 [%rd1071], {%rd1072, %rd1072}; mov.u32 %r1687, 1065353216; st.local.u32 [%rd1071], %r1687; st.local.u32 [%rd1071+12], %r1687; ld.local.v2.u64 {%rd1073, %rd1074}, [%rd1071]; mov.b64 {%r1688, %r1689}, %rd1074; mov.b64 {%r1690, %r1691}, %rd1073; st.local.v2.u64 [%rd1298], {%rd1073, %rd1074}; mov.b32 %f5546, %r1690; mov.b32 %f5531, %r1691; mov.b32 %f5530, %r1688; mov.b32 %f5635, %r1689; mov.u16 %rs35, 1; bra.uni $L__BB0_627; $L__BB0_114: add.u64 %rd1360, %SP, 32; cvta.to.local.u64 %rd1328, %rd1360; add.s64 %rd94, %rd1328, 24; cvt.u16.u32 %rs12, %r13; setp.gt.s16 %p158, %rs12, 2; @%p158 bra $L__BB0_117; setp.eq.s16 %p161, %rs12, 1; @%p161 bra $L__BB0_190; setp.eq.s16 %p162, %rs12, 2; @%p162 bra $L__BB0_146; bra.uni 
$L__BB0_284; $L__BB0_146: ld.global.u64 %rd628, [%rd67+56]; mul.wide.u32 %rd629, %r8, 16; add.s64 %rd630, %rd628, %rd629; add.s64 %rd106, %rd630, 4; ld.global.f32 %f155, [%rd67+44]; ld.global.f32 %f156, [%rd67+40]; ld.local.v4.f32 {%f1884, %f1885, %f1886, %f1887}, [%rd522]; add.f32 %f1890, %f1887, %f1884; mul.f32 %f157, %f1890, 0f3F000000; sub.f32 %f1891, %f1884, %f1887; mul.f32 %f1892, %f1891, 0f3F000000; add.f32 %f1895, %f1885, %f1886; mul.f32 %f1896, %f1895, 0f3F000000; sub.f32 %f1897, %f1885, %f1886; mul.f32 %f158, %f1897, 0f3F000000; mul.f32 %f1898, %f158, %f158; fma.rn.f32 %f1899, %f157, %f157, %f1898; sqrt.rn.f32 %f1900, %f1899; mul.f32 %f1901, %f1896, %f1896; fma.rn.f32 %f1902, %f1892, %f1892, %f1901; sqrt.rn.f32 %f1903, %f1902; add.f32 %f159, %f1900, %f1903; sub.f32 %f160, %f1900, %f1903; abs.f32 %f161, %f1892; abs.f32 %f162, %f1896; setp.eq.f32 %p204, %f161, 0f00000000; setp.eq.f32 %p205, %f162, 0f00000000; and.pred %p206, %p204, %p205; mov.b32 %r56, %f1892; mov.b32 %r596, %f1896; and.b32 %r57, %r596, -2147483648; @%p206 bra $L__BB0_150; bra.uni $L__BB0_147; $L__BB0_150: shr.s32 %r601, %r56, 31; and.b32 %r602, %r601, 1078530011; or.b32 %r603, %r602, %r57; mov.b32 %f5486, %r603; bra.uni $L__BB0_151; $L__BB0_344: @%p156 bra $L__BB0_346; abs.f32 %f3216, %f5546; setp.gt.f32 %p468, %f3216, 0f461C4000; @%p468 bra $L__BB0_626; $L__BB0_346: ld.global.u16 %rs5, [%rd67]; mov.f32 %f5547, 0f00000000; setp.eq.s16 %p469, %rs5, 0; @%p469 bra $L__BB0_366; setp.ne.s16 %p470, %rs5, 1; @%p470 bra $L__BB0_386; mov.f32 %f5360, 0f3102E308; mov.f32 %f5359, 0fBF317218; mov.f32 %f5358, 0f3FB8AA3B; mov.f32 %f5357, 0f35BFBE8E; mov.f32 %f5356, 0f3F317200; mov.f32 %f5355, 0f3DAAAABD; mov.f32 %f5354, 0f3C4CAF63; mov.f32 %f5353, 0f3B18F0FE; mov.b32 %f5543, %r10; ld.global.u64 %rd844, [%rd67+24]; mul.wide.u32 %rd845, %r8, 16; add.s64 %rd846, %rd844, %rd845; ld.f32 %f442, [%rd846+8]; ld.global.f32 %f443, [%rd67+16]; mul.f32 %f3221, %f442, %f443; mul.f32 %f444, %f3221, 0f3F000000; mul.f32 
%f3222, %f5530, %f5530; fma.rn.f32 %f3223, %f5546, %f5546, %f3222; mul.f32 %f3224, %f5635, %f5635; fma.rn.f32 %f3225, %f5531, %f5531, %f3224; add.f32 %f3226, %f3223, 0f00000000; mov.f32 %f3227, 0f00000000; add.f32 %f445, %f3226, %f3225; mov.f32 %f3228, 0fBF000000; cvt.rzi.f32.f32 %f3229, %f3228; add.f32 %f3230, %f3229, %f3229; mov.f32 %f3231, 0fBF800000; sub.f32 %f3232, %f3231, %f3230; abs.f32 %f446, %f3232; abs.f32 %f447, %f439; setp.lt.f32 %p471, %f447, 0f00800000; mul.f32 %f3233, %f447, 0f4B800000; selp.f32 %f3234, %f3233, %f447, %p471; selp.f32 %f3235, 0fC3170000, 0fC2FE0000, %p471; mov.b32 %r1137, %f3234; and.b32 %r1138, %r1137, 8388607; or.b32 %r1139, %r1138, 1065353216; mov.b32 %f3236, %r1139; shr.u32 %r1140, %r1137, 23; cvt.rn.f32.u32 %f3237, %r1140; add.f32 %f3238, %f3235, %f3237; setp.gt.f32 %p472, %f3236, 0f3FB504F3; mul.f32 %f3239, %f3236, 0f3F000000; add.f32 %f3240, %f3238, 0f3F800000; selp.f32 %f3241, %f3240, %f3238, %p472; selp.f32 %f3242, %f3239, %f3236, %p472; add.f32 %f3243, %f3242, 0fBF800000; add.f32 %f3219, %f3242, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f3218,%f3219; // end inline asm add.f32 %f3244, %f3243, %f3243; mul.f32 %f3245, %f3218, %f3244; mul.f32 %f3246, %f3245, %f3245; fma.rn.f32 %f3249, %f5353, %f3246, %f5354; fma.rn.f32 %f3251, %f3249, %f3246, %f5355; mul.rn.f32 %f3252, %f3251, %f3246; mul.rn.f32 %f3253, %f3252, %f3245; sub.f32 %f3254, %f3243, %f3245; add.f32 %f3255, %f3254, %f3254; neg.f32 %f3256, %f3245; fma.rn.f32 %f3257, %f3256, %f3243, %f3255; mul.rn.f32 %f3258, %f3218, %f3257; add.f32 %f3259, %f3253, %f3245; sub.f32 %f3260, %f3245, %f3259; add.f32 %f3261, %f3253, %f3260; add.f32 %f3262, %f3258, %f3261; add.f32 %f3263, %f3259, %f3262; sub.f32 %f3264, %f3259, %f3263; add.f32 %f3265, %f3262, %f3264; mul.rn.f32 %f3267, %f3241, %f5356; mul.rn.f32 %f3269, %f3241, %f5357; add.f32 %f3270, %f3267, %f3263; sub.f32 %f3271, %f3267, %f3270; add.f32 %f3272, %f3263, %f3271; add.f32 %f3273, %f3265, %f3272; add.f32 %f3274, %f3269, 
%f3273; add.f32 %f3275, %f3270, %f3274; sub.f32 %f3276, %f3270, %f3275; add.f32 %f3277, %f3274, %f3276; mul.rn.f32 %f3278, %f3231, %f3275; neg.f32 %f3279, %f3278; fma.rn.f32 %f3280, %f3231, %f3275, %f3279; fma.rn.f32 %f3281, %f3231, %f3277, %f3280; fma.rn.f32 %f3282, %f3227, %f3275, %f3281; add.rn.f32 %f3283, %f3278, %f3282; neg.f32 %f3284, %f3283; add.rn.f32 %f3285, %f3278, %f3284; add.rn.f32 %f3286, %f3285, %f3282; mov.b32 %r1141, %f3283; setp.eq.s32 %p473, %r1141, 1118925336; add.s32 %r1142, %r1141, -1; mov.b32 %f3287, %r1142; add.f32 %f3288, %f3286, 0f37000000; selp.f32 %f448, %f3288, %f3286, %p473; selp.f32 %f3289, %f3287, %f3283, %p473; mul.rn.f32 %f3291, %f3289, %f5358; cvt.rzi.f32.f32 %f3292, %f3291; abs.f32 %f3293, %f3292; setp.gt.f32 %p474, %f3293, 0f42FC0000; mov.b32 %r1143, %f3292; and.b32 %r1144, %r1143, -2147483648; or.b32 %r1145, %r1144, 1123811328; mov.b32 %f3294, %r1145; selp.f32 %f3295, %f3294, %f3292, %p474; fma.rn.f32 %f3297, %f3295, %f5359, %f3289; fma.rn.f32 %f3299, %f3295, %f5360, %f3297; mul.f32 %f3300, %f3299, 0f3FB8AA3B; add.f32 %f3301, %f3295, 0f4B40007F; mov.b32 %r1146, %f3301; shl.b32 %r1147, %r1146, 23; mov.b32 %f3302, %r1147; ex2.approx.ftz.f32 %f3303, %f3300; mul.f32 %f449, %f3303, %f3302; setp.eq.f32 %p475, %f449, 0f7F800000; mov.f32 %f5537, 0f7F800000; @%p475 bra $L__BB0_350; fma.rn.f32 %f5537, %f449, %f448, %f449; $L__BB0_350: setp.lt.f32 %p476, %f439, 0f00000000; setp.eq.f32 %p477, %f446, 0f3F800000; and.pred %p11, %p476, %p477; setp.eq.f32 %p478, %f439, 0f00000000; @%p478 bra $L__BB0_354; bra.uni $L__BB0_351; $L__BB0_354: add.f32 %f3308, %f439, %f439; mov.b32 %r1150, %f3308; or.b32 %r1151, %r1150, 2139095040; mov.b32 %f3309, %r1151; selp.f32 %f5539, %f3309, 0f7F800000, %p477; bra.uni $L__BB0_355; $L__BB0_117: setp.eq.s16 %p159, %rs12, 4; @%p159 bra $L__BB0_341; setp.ne.s16 %p160, %rs12, 3; @%p160 bra $L__BB0_284; ld.global.u64 %rd584, [%rd67+56]; mul.wide.u32 %rd585, %r8, 16; add.s64 %rd586, %rd584, %rd585; add.s64 %rd95, 
%rd586, 8; ld.local.v4.f32 {%f1697, %f1698, %f1699, %f1700}, [%rd522]; add.f32 %f1703, %f1700, %f1697; mul.f32 %f124, %f1703, 0f3F000000; sub.f32 %f1704, %f1697, %f1700; mul.f32 %f1705, %f1704, 0f3F000000; add.f32 %f1708, %f1698, %f1699; mul.f32 %f1709, %f1708, 0f3F000000; sub.f32 %f1710, %f1698, %f1699; mul.f32 %f125, %f1710, 0f3F000000; mul.f32 %f1711, %f125, %f125; fma.rn.f32 %f1712, %f124, %f124, %f1711; sqrt.rn.f32 %f1713, %f1712; mul.f32 %f1714, %f1709, %f1709; fma.rn.f32 %f1715, %f1705, %f1705, %f1714; sqrt.rn.f32 %f1716, %f1715; add.f32 %f126, %f1713, %f1716; sub.f32 %f127, %f1713, %f1716; abs.f32 %f128, %f1705; abs.f32 %f129, %f1709; setp.eq.f32 %p163, %f128, 0f00000000; setp.eq.f32 %p164, %f129, 0f00000000; and.pred %p165, %p163, %p164; mov.b32 %r14, %f1705; mov.b32 %r479, %f1709; and.b32 %r15, %r479, -2147483648; @%p165 bra $L__BB0_123; bra.uni $L__BB0_120; $L__BB0_123: shr.s32 %r484, %r14, 31; and.b32 %r485, %r484, 1078530011; or.b32 %r486, %r485, %r15; mov.b32 %f5482, %r486; bra.uni $L__BB0_124; $L__BB0_190: ld.global.u64 %rd710, [%rd67+64]; mul.wide.u32 %rd711, %r8, 16; add.s64 %rd131, %rd710, %rd711; ld.f32 %f5517, [%rd131]; ld.global.f32 %f211, [%rd67+52]; ld.global.f32 %f212, [%rd67+56]; ld.global.f32 %f213, [%rd67+60]; ld.local.v2.u64 {%rd1343, %rd1344}, [%rd522]; mov.b64 {%r728, %r729}, %rd1344; mov.b64 {%r730, %r731}, %rd1343; mov.b32 %f2144, %r730; mov.b32 %f2145, %r729; add.f32 %f2146, %f2145, %f2144; mul.f32 %f214, %f2146, 0f3F000000; sub.f32 %f2147, %f2144, %f2145; mul.f32 %f2148, %f2147, 0f3F000000; mov.b32 %f2149, %r731; mov.b32 %f2150, %r728; add.f32 %f2151, %f2149, %f2150; mul.f32 %f2152, %f2151, 0f3F000000; sub.f32 %f2153, %f2149, %f2150; mul.f32 %f215, %f2153, 0f3F000000; mul.f32 %f2154, %f215, %f215; fma.rn.f32 %f2155, %f214, %f214, %f2154; sqrt.rn.f32 %f2156, %f2155; mul.f32 %f2157, %f2152, %f2152; fma.rn.f32 %f2158, %f2148, %f2148, %f2157; sqrt.rn.f32 %f2159, %f2158; add.f32 %f216, %f2156, %f2159; sub.f32 %f2160, %f2156, %f2159; 
setp.lt.f32 %p266, %f2160, 0f00000000; selp.f32 %f217, 0fBF800000, 0f3F800000, %p266; mul.f32 %f218, %f2160, %f217; abs.f32 %f219, %f2148; abs.f32 %f220, %f2152; setp.eq.f32 %p267, %f219, 0f00000000; setp.eq.f32 %p268, %f220, 0f00000000; and.pred %p269, %p267, %p268; mov.b32 %r98, %f2148; mov.b32 %r732, %f2152; and.b32 %r99, %r732, -2147483648; @%p269 bra $L__BB0_194; bra.uni $L__BB0_191; $L__BB0_194: shr.s32 %r737, %r98, 31; and.b32 %r738, %r737, 1078530011; or.b32 %r739, %r738, %r99; mov.b32 %f5494, %r739; bra.uni $L__BB0_195; $L__BB0_284: mov.b32 %f2829, %r10; ld.global.u64 %rd751, [%rd67+72]; mul.wide.u32 %rd752, %r8, 16; add.s64 %rd753, %rd751, %rd752; add.s64 %rd156, %rd753, 4; ld.global.u8 %rs25, [%rd67+64]; setp.ne.s16 %p390, %rs25, 0; setp.neu.f32 %p391, %f2829, 0f00000000; and.pred %p392, %p391, %p390; @%p392 bra $L__BB0_341; ld.local.v4.f32 {%f2830, %f2831, %f2832, %f2833}, [%rd522]; add.f32 %f2836, %f2833, %f2830; mul.f32 %f348, %f2836, 0f3F000000; sub.f32 %f2837, %f2830, %f2833; mul.f32 %f2838, %f2837, 0f3F000000; add.f32 %f2841, %f2831, %f2832; mul.f32 %f2842, %f2841, 0f3F000000; sub.f32 %f2843, %f2831, %f2832; mul.f32 %f349, %f2843, 0f3F000000; mul.f32 %f2844, %f349, %f349; fma.rn.f32 %f2845, %f348, %f348, %f2844; sqrt.rn.f32 %f2846, %f2845; mul.f32 %f2847, %f2842, %f2842; fma.rn.f32 %f2848, %f2838, %f2838, %f2847; sqrt.rn.f32 %f2849, %f2848; add.f32 %f350, %f2846, %f2849; sub.f32 %f2850, %f2846, %f2849; setp.lt.f32 %p393, %f2850, 0f00000000; selp.f32 %f351, 0fBF800000, 0f3F800000, %p393; mul.f32 %f352, %f2850, %f351; abs.f32 %f353, %f2838; abs.f32 %f354, %f2842; setp.eq.f32 %p394, %f353, 0f00000000; setp.eq.f32 %p395, %f354, 0f00000000; and.pred %p396, %p394, %p395; mov.b32 %r141, %f2838; mov.b32 %r952, %f2842; and.b32 %r142, %r952, -2147483648; @%p396 bra $L__BB0_289; bra.uni $L__BB0_286; $L__BB0_289: shr.s32 %r957, %r141, 31; and.b32 %r958, %r957, 1078530011; or.b32 %r959, %r958, %r142; mov.b32 %f5518, %r959; bra.uni $L__BB0_290; $L__BB0_366: 
add.u64 %rd1296, %SPL, 80; ld.global.u64 %rd847, [%rd67+24]; mul.wide.u32 %rd848, %r8, 16; add.s64 %rd849, %rd847, %rd848; ld.f32 %f470, [%rd849+8]; ld.local.v4.f32 {%f5546, %f3353, %f3354, %f3355}, [%rd1296]; mul.f32 %f3359, %f3355, %f5546; mul.f32 %f3360, %f3354, %f3353; sub.f32 %f472, %f3359, %f3360; add.f32 %f3361, %f3355, %f5546; mul.f32 %f473, %f3361, 0f3F000000; sub.f32 %f3362, %f5546, %f3355; mul.f32 %f3363, %f3362, 0f3F000000; add.f32 %f3364, %f3353, %f3354; mul.f32 %f3365, %f3364, 0f3F000000; sub.f32 %f3366, %f3353, %f3354; mul.f32 %f474, %f3366, 0f3F000000; mul.f32 %f3367, %f474, %f474; fma.rn.f32 %f475, %f473, %f473, %f3367; mul.f32 %f3368, %f3365, %f3365; fma.rn.f32 %f476, %f3363, %f3363, %f3368; abs.f32 %f477, %f3363; abs.f32 %f478, %f3365; setp.eq.f32 %p490, %f477, 0f00000000; setp.eq.f32 %p491, %f478, 0f00000000; and.pred %p492, %p490, %p491; mov.b32 %r203, %f3363; mov.b32 %r1157, %f3365; and.b32 %r204, %r1157, -2147483648; @%p492 bra $L__BB0_370; bra.uni $L__BB0_367; $L__BB0_370: shr.s32 %r1162, %r203, 31; and.b32 %r1163, %r1162, 1078530011; or.b32 %r1164, %r1163, %r204; mov.b32 %f5544, %r1164; bra.uni $L__BB0_371; $L__BB0_191: setp.eq.f32 %p270, %f219, 0f7F800000; setp.eq.f32 %p271, %f220, 0f7F800000; and.pred %p272, %p270, %p271; @%p272 bra $L__BB0_193; bra.uni $L__BB0_192; $L__BB0_193: setp.lt.s32 %p276, %r98, 0; selp.b32 %r735, 1075235812, 1061752795, %p276; or.b32 %r736, %r735, %r99; mov.b32 %f5494, %r736; bra.uni $L__BB0_195; $L__BB0_147: setp.eq.f32 %p207, %f161, 0f7F800000; setp.eq.f32 %p208, %f162, 0f7F800000; and.pred %p209, %p207, %p208; @%p209 bra $L__BB0_149; bra.uni $L__BB0_148; $L__BB0_149: setp.lt.s32 %p213, %r56, 0; selp.b32 %r599, 1075235812, 1061752795, %p213; or.b32 %r600, %r599, %r57; mov.b32 %f5486, %r600; bra.uni $L__BB0_151; $L__BB0_120: setp.eq.f32 %p166, %f128, 0f7F800000; setp.eq.f32 %p167, %f129, 0f7F800000; and.pred %p168, %p166, %p167; @%p168 bra $L__BB0_122; bra.uni $L__BB0_121; $L__BB0_122: setp.lt.s32 %p172, %r14, 
0; selp.b32 %r482, 1075235812, 1061752795, %p172; or.b32 %r483, %r482, %r15; mov.b32 %f5482, %r483; bra.uni $L__BB0_124; $L__BB0_367: setp.eq.f32 %p493, %f477, 0f7F800000; setp.eq.f32 %p494, %f478, 0f7F800000; and.pred %p495, %p493, %p494; @%p495 bra $L__BB0_369; bra.uni $L__BB0_368; $L__BB0_369: setp.lt.s32 %p499, %r203, 0; selp.b32 %r1160, 1075235812, 1061752795, %p499; or.b32 %r1161, %r1160, %r204; mov.b32 %f5544, %r1161; bra.uni $L__BB0_371; $L__BB0_351: mov.b32 %r1148, %f5537; xor.b32 %r1149, %r1148, -2147483648; mov.b32 %f3304, %r1149; selp.f32 %f5539, %f3304, %f5537, %p11; setp.geu.f32 %p479, %f439, 0f00000000; @%p479 bra $L__BB0_355; cvt.rzi.f32.f32 %f3306, %f3231; setp.eq.f32 %p480, %f3306, 0fBF800000; @%p480 bra $L__BB0_355; mov.f32 %f5539, 0f7FFFFFFF; $L__BB0_355: add.f32 %f3310, %f447, 0f3F800000; mov.b32 %r1152, %f3310; setp.lt.s32 %p482, %r1152, 2139095040; @%p482 bra $L__BB0_360; setp.gtu.f32 %p483, %f447, 0f7F800000; @%p483 bra $L__BB0_359; bra.uni $L__BB0_357; $L__BB0_359: add.f32 %f5539, %f439, 0fBF800000; bra.uni $L__BB0_360; $L__BB0_192: setp.lt.s32 %p273, %r98, 0; min.f32 %f2161, %f220, %f219; max.f32 %f2162, %f220, %f219; div.rn.f32 %f2163, %f2161, %f2162; mul.rn.f32 %f2164, %f2163, %f2163; mov.f32 %f2165, 0fC0B59883; mov.f32 %f2166, 0fBF52C7EA; fma.rn.f32 %f2167, %f2164, %f2166, %f2165; mov.f32 %f2168, 0fC0D21907; fma.rn.f32 %f2169, %f2167, %f2164, %f2168; mul.f32 %f2170, %f2164, %f2169; mul.f32 %f2171, %f2163, %f2170; add.f32 %f2172, %f2164, 0f41355DC0; mov.f32 %f2173, 0f41E6BD60; fma.rn.f32 %f2174, %f2172, %f2164, %f2173; mov.f32 %f2175, 0f419D92C8; fma.rn.f32 %f2176, %f2174, %f2164, %f2175; rcp.rn.f32 %f2177, %f2176; fma.rn.f32 %f2178, %f2171, %f2177, %f2163; mov.f32 %f2179, 0f3FC90FDB; sub.f32 %f2180, %f2179, %f2178; setp.gt.f32 %p274, %f220, %f219; selp.f32 %f2181, %f2180, %f2178, %p274; mov.f32 %f2182, 0f40490FDB; sub.f32 %f2183, %f2182, %f2181; selp.f32 %f2184, %f2183, %f2181, %p273; mov.b32 %r733, %f2184; or.b32 %r734, %r99, %r733; 
mov.b32 %f2185, %r734; add.f32 %f2186, %f219, %f220; setp.le.f32 %p275, %f2186, 0f7F800000; selp.f32 %f5494, %f2185, %f2186, %p275; $L__BB0_195: abs.f32 %f225, %f214; setp.eq.f32 %p277, %f225, 0f00000000; abs.f32 %f226, %f215; setp.eq.f32 %p278, %f226, 0f00000000; and.pred %p279, %p277, %p278; mov.b32 %r100, %f214; mov.b32 %r740, %f215; and.b32 %r101, %r740, -2147483648; @%p279 bra $L__BB0_199; bra.uni $L__BB0_196; $L__BB0_199: shr.s32 %r745, %r100, 31; and.b32 %r746, %r745, 1078530011; or.b32 %r747, %r746, %r101; mov.b32 %f5495, %r747; bra.uni $L__BB0_200; $L__BB0_196: setp.eq.f32 %p280, %f225, 0f7F800000; setp.eq.f32 %p281, %f226, 0f7F800000; and.pred %p282, %p280, %p281; @%p282 bra $L__BB0_198; bra.uni $L__BB0_197; $L__BB0_198: setp.lt.s32 %p286, %r100, 0; selp.b32 %r743, 1075235812, 1061752795, %p286; or.b32 %r744, %r743, %r101; mov.b32 %f5495, %r744; bra.uni $L__BB0_200; $L__BB0_197: setp.lt.s32 %p283, %r100, 0; min.f32 %f2187, %f226, %f225; max.f32 %f2188, %f226, %f225; div.rn.f32 %f2189, %f2187, %f2188; mul.rn.f32 %f2190, %f2189, %f2189; mov.f32 %f2191, 0fC0B59883; mov.f32 %f2192, 0fBF52C7EA; fma.rn.f32 %f2193, %f2190, %f2192, %f2191; mov.f32 %f2194, 0fC0D21907; fma.rn.f32 %f2195, %f2193, %f2190, %f2194; mul.f32 %f2196, %f2190, %f2195; mul.f32 %f2197, %f2189, %f2196; add.f32 %f2198, %f2190, 0f41355DC0; mov.f32 %f2199, 0f41E6BD60; fma.rn.f32 %f2200, %f2198, %f2190, %f2199; mov.f32 %f2201, 0f419D92C8; fma.rn.f32 %f2202, %f2200, %f2190, %f2201; rcp.rn.f32 %f2203, %f2202; fma.rn.f32 %f2204, %f2197, %f2203, %f2189; mov.f32 %f2205, 0f3FC90FDB; sub.f32 %f2206, %f2205, %f2204; setp.gt.f32 %p284, %f226, %f225; selp.f32 %f2207, %f2206, %f2204, %p284; mov.f32 %f2208, 0f40490FDB; sub.f32 %f2209, %f2208, %f2207; selp.f32 %f2210, %f2209, %f2207, %p283; mov.b32 %r741, %f2210; or.b32 %r742, %r101, %r741; mov.b32 %f2211, %r742; add.f32 %f2212, %f225, %f226; setp.le.f32 %p285, %f2212, 0f7F800000; selp.f32 %f5495, %f2211, %f2212, %p285; $L__BB0_200: sub.f32 %f2213, %f5495, 
%f5494; mul.f32 %f231, %f2213, 0f3F000000; add.f32 %f2214, %f5494, %f5495; mul.f32 %f232, %f2214, 0f3F000000; mul.f32 %f2215, %f231, 0f3F22F983; cvt.rni.s32.f32 %r1747, %f2215; cvt.rn.f32.s32 %f2216, %r1747; mov.f32 %f2217, 0fBFC90FDA; fma.rn.f32 %f2218, %f2216, %f2217, %f231; mov.f32 %f2219, 0fB3A22168; fma.rn.f32 %f2220, %f2216, %f2219, %f2218; mov.f32 %f2221, 0fA7C234C5; fma.rn.f32 %f5496, %f2216, %f2221, %f2220; abs.f32 %f234, %f231; setp.leu.f32 %p287, %f234, 0f47CE4780; @%p287 bra $L__BB0_208; setp.eq.f32 %p288, %f234, 0f7F800000; @%p288 bra $L__BB0_207; bra.uni $L__BB0_202; $L__BB0_207: mul.rn.f32 %f5496, %f231, %f1058; bra.uni $L__BB0_208; $L__BB0_148: setp.lt.s32 %p210, %r56, 0; min.f32 %f1904, %f162, %f161; max.f32 %f1905, %f162, %f161; div.rn.f32 %f1906, %f1904, %f1905; mul.rn.f32 %f1907, %f1906, %f1906; mov.f32 %f1908, 0fC0B59883; mov.f32 %f1909, 0fBF52C7EA; fma.rn.f32 %f1910, %f1907, %f1909, %f1908; mov.f32 %f1911, 0fC0D21907; fma.rn.f32 %f1912, %f1910, %f1907, %f1911; mul.f32 %f1913, %f1907, %f1912; mul.f32 %f1914, %f1906, %f1913; add.f32 %f1915, %f1907, 0f41355DC0; mov.f32 %f1916, 0f41E6BD60; fma.rn.f32 %f1917, %f1915, %f1907, %f1916; mov.f32 %f1918, 0f419D92C8; fma.rn.f32 %f1919, %f1917, %f1907, %f1918; rcp.rn.f32 %f1920, %f1919; fma.rn.f32 %f1921, %f1914, %f1920, %f1906; mov.f32 %f1922, 0f3FC90FDB; sub.f32 %f1923, %f1922, %f1921; setp.gt.f32 %p211, %f162, %f161; selp.f32 %f1924, %f1923, %f1921, %p211; mov.f32 %f1925, 0f40490FDB; sub.f32 %f1926, %f1925, %f1924; selp.f32 %f1927, %f1926, %f1924, %p210; mov.b32 %r597, %f1927; or.b32 %r598, %r57, %r597; mov.b32 %f1928, %r598; add.f32 %f1929, %f161, %f162; setp.le.f32 %p212, %f1929, 0f7F800000; selp.f32 %f5486, %f1928, %f1929, %p212; $L__BB0_151: abs.f32 %f167, %f157; setp.eq.f32 %p214, %f167, 0f00000000; abs.f32 %f168, %f158; setp.eq.f32 %p215, %f168, 0f00000000; and.pred %p216, %p214, %p215; mov.b32 %r58, %f157; mov.b32 %r604, %f158; and.b32 %r59, %r604, -2147483648; @%p216 bra $L__BB0_155; bra.uni 
$L__BB0_152; $L__BB0_155: shr.s32 %r609, %r58, 31; and.b32 %r610, %r609, 1078530011; or.b32 %r611, %r610, %r59; mov.b32 %f5487, %r611; bra.uni $L__BB0_156; $L__BB0_152: setp.eq.f32 %p217, %f167, 0f7F800000; setp.eq.f32 %p218, %f168, 0f7F800000; and.pred %p219, %p217, %p218; @%p219 bra $L__BB0_154; bra.uni $L__BB0_153; $L__BB0_154: setp.lt.s32 %p223, %r58, 0; selp.b32 %r607, 1075235812, 1061752795, %p223; or.b32 %r608, %r607, %r59; mov.b32 %f5487, %r608; bra.uni $L__BB0_156; $L__BB0_121: setp.lt.s32 %p169, %r14, 0; min.f32 %f1717, %f129, %f128; max.f32 %f1718, %f129, %f128; div.rn.f32 %f1719, %f1717, %f1718; mul.rn.f32 %f1720, %f1719, %f1719; mov.f32 %f1721, 0fC0B59883; mov.f32 %f1722, 0fBF52C7EA; fma.rn.f32 %f1723, %f1720, %f1722, %f1721; mov.f32 %f1724, 0fC0D21907; fma.rn.f32 %f1725, %f1723, %f1720, %f1724; mul.f32 %f1726, %f1720, %f1725; mul.f32 %f1727, %f1719, %f1726; add.f32 %f1728, %f1720, 0f41355DC0; mov.f32 %f1729, 0f41E6BD60; fma.rn.f32 %f1730, %f1728, %f1720, %f1729; mov.f32 %f1731, 0f419D92C8; fma.rn.f32 %f1732, %f1730, %f1720, %f1731; rcp.rn.f32 %f1733, %f1732; fma.rn.f32 %f1734, %f1727, %f1733, %f1719; mov.f32 %f1735, 0f3FC90FDB; sub.f32 %f1736, %f1735, %f1734; setp.gt.f32 %p170, %f129, %f128; selp.f32 %f1737, %f1736, %f1734, %p170; mov.f32 %f1738, 0f40490FDB; sub.f32 %f1739, %f1738, %f1737; selp.f32 %f1740, %f1739, %f1737, %p169; mov.b32 %r480, %f1740; or.b32 %r481, %r15, %r480; mov.b32 %f1741, %r481; add.f32 %f1742, %f128, %f129; setp.le.f32 %p171, %f1742, 0f7F800000; selp.f32 %f5482, %f1741, %f1742, %p171; $L__BB0_124: abs.f32 %f134, %f124; setp.eq.f32 %p173, %f134, 0f00000000; abs.f32 %f135, %f125; setp.eq.f32 %p174, %f135, 0f00000000; and.pred %p175, %p173, %p174; mov.b32 %r16, %f124; mov.b32 %r487, %f125; and.b32 %r17, %r487, -2147483648; @%p175 bra $L__BB0_128; bra.uni $L__BB0_125; $L__BB0_128: shr.s32 %r492, %r16, 31; and.b32 %r493, %r492, 1078530011; or.b32 %r494, %r493, %r17; mov.b32 %f5483, %r494; bra.uni $L__BB0_129; $L__BB0_125: setp.eq.f32 
%p176, %f134, 0f7F800000; setp.eq.f32 %p177, %f135, 0f7F800000; and.pred %p178, %p176, %p177; @%p178 bra $L__BB0_127; bra.uni $L__BB0_126; $L__BB0_127: setp.lt.s32 %p182, %r16, 0; selp.b32 %r490, 1075235812, 1061752795, %p182; or.b32 %r491, %r490, %r17; mov.b32 %f5483, %r491; bra.uni $L__BB0_129; $L__BB0_153: setp.lt.s32 %p220, %r58, 0; min.f32 %f1930, %f168, %f167; max.f32 %f1931, %f168, %f167; div.rn.f32 %f1932, %f1930, %f1931; mul.rn.f32 %f1933, %f1932, %f1932; mov.f32 %f1934, 0fC0B59883; mov.f32 %f1935, 0fBF52C7EA; fma.rn.f32 %f1936, %f1933, %f1935, %f1934; mov.f32 %f1937, 0fC0D21907; fma.rn.f32 %f1938, %f1936, %f1933, %f1937; mul.f32 %f1939, %f1933, %f1938; mul.f32 %f1940, %f1932, %f1939; add.f32 %f1941, %f1933, 0f41355DC0; mov.f32 %f1942, 0f41E6BD60; fma.rn.f32 %f1943, %f1941, %f1933, %f1942; mov.f32 %f1944, 0f419D92C8; fma.rn.f32 %f1945, %f1943, %f1933, %f1944; rcp.rn.f32 %f1946, %f1945; fma.rn.f32 %f1947, %f1940, %f1946, %f1932; mov.f32 %f1948, 0f3FC90FDB; sub.f32 %f1949, %f1948, %f1947; setp.gt.f32 %p221, %f168, %f167; selp.f32 %f1950, %f1949, %f1947, %p221; mov.f32 %f1951, 0f40490FDB; sub.f32 %f1952, %f1951, %f1950; selp.f32 %f1953, %f1952, %f1950, %p220; mov.b32 %r605, %f1953; or.b32 %r606, %r59, %r605; mov.b32 %f1954, %r606; add.f32 %f1955, %f167, %f168; setp.le.f32 %p222, %f1955, 0f7F800000; selp.f32 %f5487, %f1954, %f1955, %p222; $L__BB0_156: sub.f32 %f1956, %f5487, %f5486; mul.f32 %f173, %f1956, 0f3F000000; add.f32 %f1957, %f5486, %f5487; mul.f32 %f174, %f1957, 0f3F000000; mul.f32 %f1958, %f173, 0f3F22F983; cvt.rni.s32.f32 %r1737, %f1958; cvt.rn.f32.s32 %f1959, %r1737; mov.f32 %f1960, 0fBFC90FDA; fma.rn.f32 %f1961, %f1959, %f1960, %f173; mov.f32 %f1962, 0fB3A22168; fma.rn.f32 %f1963, %f1959, %f1962, %f1961; mov.f32 %f1964, 0fA7C234C5; fma.rn.f32 %f5488, %f1959, %f1964, %f1963; abs.f32 %f176, %f173; setp.leu.f32 %p224, %f176, 0f47CE4780; @%p224 bra $L__BB0_164; setp.eq.f32 %p225, %f176, 0f7F800000; @%p225 bra $L__BB0_163; bra.uni $L__BB0_158; 
$L__BB0_163: mul.rn.f32 %f5488, %f173, %f1058; bra.uni $L__BB0_164; $L__BB0_126: setp.lt.s32 %p179, %r16, 0; min.f32 %f1743, %f135, %f134; max.f32 %f1744, %f135, %f134; div.rn.f32 %f1745, %f1743, %f1744; mul.rn.f32 %f1746, %f1745, %f1745; mov.f32 %f1747, 0fC0B59883; mov.f32 %f1748, 0fBF52C7EA; fma.rn.f32 %f1749, %f1746, %f1748, %f1747; mov.f32 %f1750, 0fC0D21907; fma.rn.f32 %f1751, %f1749, %f1746, %f1750; mul.f32 %f1752, %f1746, %f1751; mul.f32 %f1753, %f1745, %f1752; add.f32 %f1754, %f1746, 0f41355DC0; mov.f32 %f1755, 0f41E6BD60; fma.rn.f32 %f1756, %f1754, %f1746, %f1755; mov.f32 %f1757, 0f419D92C8; fma.rn.f32 %f1758, %f1756, %f1746, %f1757; rcp.rn.f32 %f1759, %f1758; fma.rn.f32 %f1760, %f1753, %f1759, %f1745; mov.f32 %f1761, 0f3FC90FDB; sub.f32 %f1762, %f1761, %f1760; setp.gt.f32 %p180, %f135, %f134; selp.f32 %f1763, %f1762, %f1760, %p180; mov.f32 %f1764, 0f40490FDB; sub.f32 %f1765, %f1764, %f1763; selp.f32 %f1766, %f1765, %f1763, %p179; mov.b32 %r488, %f1766; or.b32 %r489, %r17, %r488; mov.b32 %f1767, %r489; add.f32 %f1768, %f134, %f135; setp.le.f32 %p181, %f1768, 0f7F800000; selp.f32 %f5483, %f1767, %f1768, %p181; $L__BB0_129: sub.f32 %f1769, %f5483, %f5482; mul.f32 %f140, %f1769, 0f3F000000; add.f32 %f1770, %f5482, %f5483; mul.f32 %f141, %f1770, 0f3F000000; mul.f32 %f1771, %f140, 0f3F22F983; cvt.rni.s32.f32 %r1727, %f1771; cvt.rn.f32.s32 %f1772, %r1727; mov.f32 %f1773, 0fBFC90FDA; fma.rn.f32 %f1774, %f1772, %f1773, %f140; mov.f32 %f1775, 0fB3A22168; fma.rn.f32 %f1776, %f1772, %f1775, %f1774; mov.f32 %f1777, 0fA7C234C5; fma.rn.f32 %f5484, %f1772, %f1777, %f1776; abs.f32 %f143, %f140; setp.leu.f32 %p183, %f143, 0f47CE4780; @%p183 bra $L__BB0_137; setp.eq.f32 %p184, %f143, 0f7F800000; @%p184 bra $L__BB0_136; bra.uni $L__BB0_131; $L__BB0_136: mul.rn.f32 %f5484, %f140, %f1058; bra.uni $L__BB0_137; $L__BB0_286: setp.eq.f32 %p397, %f353, 0f7F800000; setp.eq.f32 %p398, %f354, 0f7F800000; and.pred %p399, %p397, %p398; @%p399 bra $L__BB0_288; bra.uni $L__BB0_287; 
$L__BB0_288: setp.lt.s32 %p403, %r141, 0; selp.b32 %r955, 1075235812, 1061752795, %p403; or.b32 %r956, %r955, %r142; mov.b32 %f5518, %r956; bra.uni $L__BB0_290; $L__BB0_357: setp.neu.f32 %p484, %f447, 0f7F800000; @%p484 bra $L__BB0_360; selp.f32 %f5539, 0f80000000, 0f00000000, %p11; $L__BB0_360: setp.eq.f32 %p485, %f439, 0f3F800000; selp.f32 %f3311, 0f3F800000, %f5539, %p485; fma.rn.f32 %f3312, %f445, %f3311, 0fC0000000; mul.f32 %f458, %f444, %f3312; setp.lt.f32 %p486, %f439, 0f3F800000; @%p486 bra $L__BB0_364; bra.uni $L__BB0_361; $L__BB0_364: mul.f32 %f3350, %f5543, 0f3F7FBE77; mul.f32 %f5542, %f3350, %f5543; mov.f32 %f5541, 0f3A83126F; mov.f32 %f5543, %f458; bra.uni $L__BB0_365; $L__BB0_361: ld.global.f32 %f459, [%rd67+12]; mul.f32 %f3313, %f439, 0f4B000000; setp.lt.f32 %p487, %f439, 0f00800000; selp.f32 %f460, %f3313, %f439, %p487; selp.f32 %f3314, 0fC1B80000, 0f00000000, %p487; mov.b32 %r1153, %f460; add.s32 %r1154, %r1153, -1059760811; and.b32 %r1155, %r1154, -8388608; sub.s32 %r1156, %r1153, %r1155; mov.b32 %f3315, %r1156; cvt.rn.f32.s32 %f3316, %r1155; mov.f32 %f3317, 0f34000000; fma.rn.f32 %f3318, %f3316, %f3317, %f3314; add.f32 %f3319, %f3315, 0fBF800000; mov.f32 %f3320, 0f3E1039F6; mov.f32 %f3321, 0fBE055027; fma.rn.f32 %f3322, %f3321, %f3319, %f3320; mov.f32 %f3323, 0fBDF8CDCC; fma.rn.f32 %f3324, %f3322, %f3319, %f3323; mov.f32 %f3325, 0f3E0F2955; fma.rn.f32 %f3326, %f3324, %f3319, %f3325; mov.f32 %f3327, 0fBE2AD8B9; fma.rn.f32 %f3328, %f3326, %f3319, %f3327; mov.f32 %f3329, 0f3E4CED0B; fma.rn.f32 %f3330, %f3328, %f3319, %f3329; mov.f32 %f3331, 0fBE7FFF22; fma.rn.f32 %f3332, %f3330, %f3319, %f3331; mov.f32 %f3333, 0f3EAAAA78; fma.rn.f32 %f3334, %f3332, %f3319, %f3333; fma.rn.f32 %f3336, %f3334, %f3319, %f3228; mul.f32 %f3337, %f3319, %f3336; fma.rn.f32 %f3338, %f3337, %f3319, %f3319; mov.f32 %f3339, 0f3F317218; fma.rn.f32 %f5540, %f3318, %f3339, %f3338; setp.lt.u32 %p488, %r1153, 2139095040; @%p488 bra $L__BB0_363; mov.f32 %f3340, 0f7F800000; fma.rn.f32 
%f5540, %f460, %f3340, %f3340; $L__BB0_363: setp.eq.f32 %p489, %f460, 0f00000000; selp.f32 %f3341, 0fFF800000, %f5540, %p489; mul.f32 %f3342, %f443, 0f3F2AAAAB; mul.f32 %f3343, %f442, %f459; fma.rn.f32 %f3344, %f442, %f3342, %f3343; mul.f32 %f3345, %f3344, 0f3F000000; fma.rn.f32 %f3346, %f439, %f439, 0fBF800000; mul.f32 %f3347, %f3346, 0f3F000000; sub.f32 %f3348, %f3347, %f3341; mul.f32 %f5541, %f3345, %f3348; mov.f32 %f5542, %f458; $L__BB0_365: add.f32 %f3351, %f5541, %f5542; mul.f32 %f5547, %f3351, %f5543; bra.uni $L__BB0_386; $L__BB0_368: setp.lt.s32 %p496, %r203, 0; min.f32 %f3369, %f478, %f477; max.f32 %f3370, %f478, %f477; div.rn.f32 %f3371, %f3369, %f3370; mul.rn.f32 %f3372, %f3371, %f3371; mov.f32 %f3373, 0fC0B59883; mov.f32 %f3374, 0fBF52C7EA; fma.rn.f32 %f3375, %f3372, %f3374, %f3373; mov.f32 %f3376, 0fC0D21907; fma.rn.f32 %f3377, %f3375, %f3372, %f3376; mul.f32 %f3378, %f3372, %f3377; mul.f32 %f3379, %f3371, %f3378; add.f32 %f3380, %f3372, 0f41355DC0; mov.f32 %f3381, 0f41E6BD60; fma.rn.f32 %f3382, %f3380, %f3372, %f3381; mov.f32 %f3383, 0f419D92C8; fma.rn.f32 %f3384, %f3382, %f3372, %f3383; rcp.rn.f32 %f3385, %f3384; fma.rn.f32 %f3386, %f3379, %f3385, %f3371; mov.f32 %f3387, 0f3FC90FDB; sub.f32 %f3388, %f3387, %f3386; setp.gt.f32 %p497, %f478, %f477; selp.f32 %f3389, %f3388, %f3386, %p497; mov.f32 %f3390, 0f40490FDB; sub.f32 %f3391, %f3390, %f3389; selp.f32 %f3392, %f3391, %f3389, %p496; mov.b32 %r1158, %f3392; or.b32 %r1159, %r204, %r1158; mov.b32 %f3393, %r1159; add.f32 %f3394, %f477, %f478; setp.le.f32 %p498, %f3394, 0f7F800000; selp.f32 %f5544, %f3393, %f3394, %p498; $L__BB0_371: abs.f32 %f483, %f473; setp.eq.f32 %p500, %f483, 0f00000000; abs.f32 %f484, %f474; setp.eq.f32 %p501, %f484, 0f00000000; and.pred %p502, %p500, %p501; mov.b32 %r205, %f473; mov.b32 %r1165, %f474; and.b32 %r206, %r1165, -2147483648; @%p502 bra $L__BB0_375; bra.uni $L__BB0_372; $L__BB0_375: shr.s32 %r1170, %r205, 31; and.b32 %r1171, %r1170, 1078530011; or.b32 %r1172, %r1171, 
%r206; mov.b32 %f5545, %r1172; bra.uni $L__BB0_376; $L__BB0_372: setp.eq.f32 %p503, %f483, 0f7F800000; setp.eq.f32 %p504, %f484, 0f7F800000; and.pred %p505, %p503, %p504; @%p505 bra $L__BB0_374; bra.uni $L__BB0_373; $L__BB0_374: setp.lt.s32 %p509, %r205, 0; selp.b32 %r1168, 1075235812, 1061752795, %p509; or.b32 %r1169, %r1168, %r206; mov.b32 %f5545, %r1169; bra.uni $L__BB0_376; $L__BB0_373: setp.lt.s32 %p506, %r205, 0; min.f32 %f3395, %f484, %f483; max.f32 %f3396, %f484, %f483; div.rn.f32 %f3397, %f3395, %f3396; mul.rn.f32 %f3398, %f3397, %f3397; mov.f32 %f3399, 0fC0B59883; mov.f32 %f3400, 0fBF52C7EA; fma.rn.f32 %f3401, %f3398, %f3400, %f3399; mov.f32 %f3402, 0fC0D21907; fma.rn.f32 %f3403, %f3401, %f3398, %f3402; mul.f32 %f3404, %f3398, %f3403; mul.f32 %f3405, %f3397, %f3404; add.f32 %f3406, %f3398, 0f41355DC0; mov.f32 %f3407, 0f41E6BD60; fma.rn.f32 %f3408, %f3406, %f3398, %f3407; mov.f32 %f3409, 0f419D92C8; fma.rn.f32 %f3410, %f3408, %f3398, %f3409; rcp.rn.f32 %f3411, %f3410; fma.rn.f32 %f3412, %f3405, %f3411, %f3397; mov.f32 %f3413, 0f3FC90FDB; sub.f32 %f3414, %f3413, %f3412; setp.gt.f32 %p507, %f484, %f483; selp.f32 %f3415, %f3414, %f3412, %p507; mov.f32 %f3416, 0f40490FDB; sub.f32 %f3417, %f3416, %f3415; selp.f32 %f3418, %f3417, %f3415, %p506; mov.b32 %r1166, %f3418; or.b32 %r1167, %r206, %r1166; mov.b32 %f3419, %r1167; add.f32 %f3420, %f483, %f484; setp.le.f32 %p508, %f3420, 0f7F800000; selp.f32 %f5545, %f3419, %f3420, %p508; $L__BB0_376: sub.f32 %f3421, %f5545, %f5544; mul.f32 %f489, %f3421, 0f3F000000; add.f32 %f3422, %f5544, %f5545; mul.f32 %f490, %f3422, 0f3F000000; abs.f32 %f3423, %f489; setp.leu.f32 %p510, %f3423, 0f47CE4780; setp.eq.f32 %p511, %f3423, 0f7F800000; or.pred %p512, %p510, %p511; @%p512 bra $L__BB0_380; mov.b32 %r1175, %f489; shl.b32 %r1176, %r1175, 8; or.b32 %r207, %r1176, -2147483648; add.u64 %rd853, %SP, 32; add.u64 %rd1368, %SPL, 32; mov.u32 %r1768, 0; mov.u64 %rd1367, __cudart_i2opi_f; mov.u32 %r1769, %r1768; $L__BB0_378: .pragma 
"nounroll"; mov.u32 %r209, %r1769; ld.global.nc.u32 %r1179, [%rd1367]; // begin inline asm { mad.lo.cc.u32 %r1177, %r1179, %r207, %r209; madc.hi.u32 %r1769, %r1179, %r207, 0; } // end inline asm st.local.u32 [%rd1368], %r1177; add.s64 %rd1368, %rd1368, 4; add.s64 %rd1367, %rd1367, 4; add.s32 %r1768, %r1768, 1; setp.ne.s32 %p513, %r1768, 6; @%p513 bra $L__BB0_378; mov.u32 %r1184, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1182, %r1184, %r207, %r209; madc.hi.u32 %r1183, %r1184, %r207, 0; } // end inline asm cvta.to.local.u64 %rd855, %rd853; st.local.u32 [%rd855+24], %r1183; $L__BB0_380: abs.f32 %f3424, %f490; setp.leu.f32 %p514, %f3424, 0f47CE4780; setp.eq.f32 %p515, %f3424, 0f7F800000; or.pred %p516, %p514, %p515; @%p516 bra $L__BB0_384; mov.b32 %r1189, %f490; shl.b32 %r1190, %r1189, 8; or.b32 %r212, %r1190, -2147483648; add.u64 %rd857, %SP, 32; add.u64 %rd1370, %SPL, 32; mov.u32 %r1770, 0; mov.u64 %rd1369, __cudart_i2opi_f; mov.u32 %r1771, %r1770; $L__BB0_382: .pragma "nounroll"; mov.u32 %r214, %r1771; ld.global.nc.u32 %r1193, [%rd1369]; // begin inline asm { mad.lo.cc.u32 %r1191, %r1193, %r212, %r214; madc.hi.u32 %r1771, %r1193, %r212, 0; } // end inline asm st.local.u32 [%rd1370], %r1191; add.s64 %rd1370, %rd1370, 4; add.s64 %rd1369, %rd1369, 4; add.s32 %r1770, %r1770, 1; setp.ne.s32 %p517, %r1770, 6; @%p517 bra $L__BB0_382; mov.u32 %r1198, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1196, %r1198, %r212, %r214; madc.hi.u32 %r1197, %r1198, %r212, 0; } // end inline asm cvta.to.local.u64 %rd859, %rd857; st.local.u32 [%rd859+24], %r1197; $L__BB0_384: sqrt.rn.f32 %f3425, %f475; sqrt.rn.f32 %f3426, %f476; sub.f32 %f3427, %f3425, %f3426; add.f32 %f3428, %f3425, %f3426; add.f32 %f3429, %f3428, 0fBF800000; mov.f32 %f3430, 0f00000000; max.f32 %f3431, %f3429, %f3430; setp.lt.f32 %p518, %f3427, 0f00000000; selp.f32 %f3432, 0fBF800000, 0f3F800000, %p518; fma.rn.f32 %f3433, %f3427, %f3432, 0fBF800000; max.f32 %f3434, %f3433, %f3430; ld.global.f32 %f3435, 
[%rd67+20]; mul.f32 %f3436, %f470, %f3435; mul.f32 %f3437, %f3434, %f3434; fma.rn.f32 %f3438, %f3431, %f3431, %f3437; add.f32 %f3439, %f3438, 0f00000000; mul.f32 %f5547, %f3436, %f3439; setp.lt.f32 %p519, %f472, 0f3F800000; @%p519 bra $L__BB0_386; add.f32 %f3440, %f472, 0fBF800000; ld.global.f32 %f3441, [%rd67+16]; mul.f32 %f3442, %f470, %f3441; mul.f32 %f3443, %f3442, 0f3F000000; mul.f32 %f3444, %f3440, %f3443; fma.rn.f32 %f5547, %f3440, %f3444, %f5547; $L__BB0_386: mov.b32 %f3445, %r1794; max.f32 %f495, %f3445, %f5547; ld.global.u32 %r217, [%rd67+80]; setp.eq.s32 %p520, %r217, 2; add.u64 %rd228, %SPL, 0; @%p520 bra $L__BB0_468; mov.b32 %f496, %r10; and.b16 %rs29, %rs5, 3; setp.eq.s16 %p521, %rs29, 1; @%p521 bra $L__BB0_408; setp.eq.s16 %p522, %rs29, 2; @%p522 bra $L__BB0_391; setp.ne.s16 %p523, %rs29, 3; @%p523 bra $L__BB0_423; mov.u64 %rd1375, 0; mov.u64 %rd1376, %rd1375; bra.uni $L__BB0_455; $L__BB0_391: mov.f32 %f5344, 0f3102E308; mov.f32 %f5343, 0fBF317218; mov.f32 %f5342, 0f3FB8AA3B; mov.f32 %f5341, 0f35BFBE8E; mov.f32 %f5340, 0f3F317200; mov.f32 %f5339, 0f3DAAAABD; mov.f32 %f5338, 0f3C4CAF63; mov.f32 %f5337, 0f3B18F0FE; ld.global.f32 %f497, [%rd67+8]; div.rn.f32 %f3449, %f440, %f5546; div.rn.f32 %f498, %f3449, %f440; ld.global.u32 %r218, [%rd67+12]; cvt.rn.f32.s32 %f499, %r218; mul.f32 %f3450, %f499, 0f3F000000; cvt.rzi.f32.f32 %f3451, %f3450; add.f32 %f3452, %f3451, %f3451; sub.f32 %f3453, %f499, %f3452; abs.f32 %f500, %f3453; abs.f32 %f501, %f498; setp.lt.f32 %p524, %f501, 0f00800000; mul.f32 %f3454, %f501, 0f4B800000; selp.f32 %f3455, %f3454, %f501, %p524; selp.f32 %f3456, 0fC3170000, 0fC2FE0000, %p524; mov.b32 %r1201, %f3455; and.b32 %r1202, %r1201, 8388607; or.b32 %r1203, %r1202, 1065353216; mov.b32 %f3457, %r1203; shr.u32 %r1204, %r1201, 23; cvt.rn.f32.u32 %f3458, %r1204; add.f32 %f3459, %f3456, %f3458; setp.gt.f32 %p525, %f3457, 0f3FB504F3; mul.f32 %f3460, %f3457, 0f3F000000; add.f32 %f3461, %f3459, 0f3F800000; selp.f32 %f3462, %f3461, %f3459, %p525; 
selp.f32 %f3463, %f3460, %f3457, %p525; add.f32 %f3464, %f3463, 0fBF800000; add.f32 %f3447, %f3463, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f3446,%f3447; // end inline asm add.f32 %f3465, %f3464, %f3464; mul.f32 %f3466, %f3446, %f3465; mul.f32 %f3467, %f3466, %f3466; fma.rn.f32 %f3470, %f5337, %f3467, %f5338; fma.rn.f32 %f3472, %f3470, %f3467, %f5339; mul.rn.f32 %f3473, %f3472, %f3467; mul.rn.f32 %f3474, %f3473, %f3466; sub.f32 %f3475, %f3464, %f3466; add.f32 %f3476, %f3475, %f3475; neg.f32 %f3477, %f3466; fma.rn.f32 %f3478, %f3477, %f3464, %f3476; mul.rn.f32 %f3479, %f3446, %f3478; add.f32 %f3480, %f3474, %f3466; sub.f32 %f3481, %f3466, %f3480; add.f32 %f3482, %f3474, %f3481; add.f32 %f3483, %f3479, %f3482; add.f32 %f3484, %f3480, %f3483; sub.f32 %f3485, %f3480, %f3484; add.f32 %f3486, %f3483, %f3485; mul.rn.f32 %f3488, %f3462, %f5340; mul.rn.f32 %f3490, %f3462, %f5341; add.f32 %f3491, %f3488, %f3484; sub.f32 %f3492, %f3488, %f3491; add.f32 %f3493, %f3484, %f3492; add.f32 %f3494, %f3486, %f3493; add.f32 %f3495, %f3490, %f3494; add.f32 %f3496, %f3491, %f3495; sub.f32 %f3497, %f3491, %f3496; add.f32 %f3498, %f3495, %f3497; abs.f32 %f502, %f499; setp.gt.f32 %p526, %f502, 0f77F684DF; mul.f32 %f3499, %f499, 0f39000000; selp.f32 %f3500, %f3499, %f499, %p526; mul.rn.f32 %f3501, %f3500, %f3496; neg.f32 %f3502, %f3501; fma.rn.f32 %f3503, %f3500, %f3496, %f3502; fma.rn.f32 %f3504, %f3500, %f3498, %f3503; mov.f32 %f3505, 0f00000000; fma.rn.f32 %f3506, %f3505, %f3496, %f3504; add.rn.f32 %f3507, %f3501, %f3506; neg.f32 %f3508, %f3507; add.rn.f32 %f3509, %f3501, %f3508; add.rn.f32 %f3510, %f3509, %f3506; mov.b32 %r1205, %f3507; setp.eq.s32 %p527, %r1205, 1118925336; add.s32 %r1206, %r1205, -1; mov.b32 %f3511, %r1206; add.f32 %f3512, %f3510, 0f37000000; selp.f32 %f503, %f3512, %f3510, %p527; selp.f32 %f3513, %f3511, %f3507, %p527; mul.rn.f32 %f3515, %f3513, %f5342; cvt.rzi.f32.f32 %f3516, %f3515; abs.f32 %f3517, %f3516; setp.gt.f32 %p528, %f3517, 0f42FC0000; mov.b32 
%r1207, %f3516; and.b32 %r1208, %r1207, -2147483648; or.b32 %r1209, %r1208, 1123811328; mov.b32 %f3518, %r1209; selp.f32 %f3519, %f3518, %f3516, %p528; fma.rn.f32 %f3521, %f3519, %f5343, %f3513; fma.rn.f32 %f3523, %f3519, %f5344, %f3521; mul.f32 %f3524, %f3523, 0f3FB8AA3B; add.f32 %f3525, %f3519, 0f4B40007F; mov.b32 %r1210, %f3525; shl.b32 %r1211, %r1210, 23; mov.b32 %f3526, %r1211; ex2.approx.ftz.f32 %f3527, %f3524; mul.f32 %f504, %f3527, %f3526; setp.eq.f32 %p529, %f504, 0f7F800000; mov.f32 %f5548, 0f7F800000; @%p529 bra $L__BB0_393; fma.rn.f32 %f5548, %f504, %f503, %f504; $L__BB0_393: setp.lt.f32 %p530, %f498, 0f00000000; setp.eq.f32 %p531, %f500, 0f3F800000; and.pred %p12, %p530, %p531; setp.eq.f32 %p532, %f498, 0f00000000; @%p532 bra $L__BB0_397; bra.uni $L__BB0_394; $L__BB0_397: add.f32 %f3531, %f498, %f498; mov.b32 %r1214, %f3531; selp.b32 %r1215, %r1214, 0, %p531; or.b32 %r1216, %r1215, 2139095040; setp.lt.s32 %p536, %r218, 0; selp.b32 %r1217, %r1216, %r1215, %p536; mov.b32 %f5550, %r1217; bra.uni $L__BB0_398; $L__BB0_408: mov.f32 %f5352, 0f3102E308; mov.f32 %f5351, 0fBF317218; mov.f32 %f5350, 0f3FB8AA3B; mov.f32 %f5349, 0f35BFBE8E; mov.f32 %f5348, 0f3F317200; mov.f32 %f5347, 0f3DAAAABD; mov.f32 %f5346, 0f3C4CAF63; mov.f32 %f5345, 0f3B18F0FE; ld.global.u64 %rd868, [%rd67+24]; mul.wide.u32 %rd869, %r8, 16; add.s64 %rd870, %rd868, %rd869; ld.f32 %f3575, [%rd870+8]; mul.f32 %f3577, %f5546, %f5635; sub.f32 %f532, %f3577, %f438; ld.global.f32 %f3578, [%rd67+16]; mul.f32 %f3579, %f3578, 0f3F2AAAAB; ld.global.f32 %f3580, [%rd67+12]; mul.f32 %f3581, %f3575, %f3580; fma.rn.f32 %f533, %f3575, %f3579, %f3581; mul.f32 %f537, %f3575, %f3578; mov.f32 %f3588, 0fBF800000; abs.f32 %f539, %f532; setp.lt.f32 %p551, %f539, 0f00800000; mul.f32 %f3590, %f539, 0f4B800000; selp.f32 %f3591, %f3590, %f539, %p551; selp.f32 %f3592, 0fC3170000, 0fC2FE0000, %p551; mov.b32 %r1230, %f3591; and.b32 %r1231, %r1230, 8388607; or.b32 %r1232, %r1231, 1065353216; mov.b32 %f3593, %r1232; shr.u32 
%r1233, %r1230, 23; cvt.rn.f32.u32 %f3594, %r1233; add.f32 %f3595, %f3592, %f3594; setp.gt.f32 %p552, %f3593, 0f3FB504F3; mul.f32 %f3596, %f3593, 0f3F000000; add.f32 %f3597, %f3595, 0f3F800000; selp.f32 %f3598, %f3597, %f3595, %p552; selp.f32 %f3599, %f3596, %f3593, %p552; add.f32 %f3600, %f3599, 0fBF800000; add.f32 %f3573, %f3599, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f3572,%f3573; // end inline asm add.f32 %f3601, %f3600, %f3600; mul.f32 %f3602, %f3572, %f3601; mul.f32 %f3603, %f3602, %f3602; fma.rn.f32 %f3606, %f5345, %f3603, %f5346; fma.rn.f32 %f3608, %f3606, %f3603, %f5347; mul.rn.f32 %f3609, %f3608, %f3603; mul.rn.f32 %f3610, %f3609, %f3602; sub.f32 %f3611, %f3600, %f3602; add.f32 %f3612, %f3611, %f3611; neg.f32 %f3613, %f3602; fma.rn.f32 %f3614, %f3613, %f3600, %f3612; mul.rn.f32 %f3615, %f3572, %f3614; add.f32 %f3616, %f3610, %f3602; sub.f32 %f3617, %f3602, %f3616; add.f32 %f3618, %f3610, %f3617; add.f32 %f3619, %f3615, %f3618; add.f32 %f3620, %f3616, %f3619; sub.f32 %f3621, %f3616, %f3620; add.f32 %f3622, %f3619, %f3621; mul.rn.f32 %f3624, %f3598, %f5348; mul.rn.f32 %f3626, %f3598, %f5349; add.f32 %f3627, %f3624, %f3620; sub.f32 %f3628, %f3624, %f3627; add.f32 %f3629, %f3620, %f3628; add.f32 %f3630, %f3622, %f3629; add.f32 %f3631, %f3626, %f3630; add.f32 %f3632, %f3627, %f3631; sub.f32 %f3633, %f3627, %f3632; add.f32 %f3634, %f3631, %f3633; mul.rn.f32 %f3635, %f3588, %f3632; neg.f32 %f3636, %f3635; fma.rn.f32 %f3637, %f3588, %f3632, %f3636; fma.rn.f32 %f3638, %f3588, %f3634, %f3637; mov.f32 %f3639, 0f00000000; fma.rn.f32 %f3640, %f3639, %f3632, %f3638; add.rn.f32 %f3641, %f3635, %f3640; neg.f32 %f3642, %f3641; add.rn.f32 %f3643, %f3635, %f3642; add.rn.f32 %f3644, %f3643, %f3640; mov.b32 %r1234, %f3641; setp.eq.s32 %p553, %r1234, 1118925336; add.s32 %r1235, %r1234, -1; mov.b32 %f3645, %r1235; add.f32 %f3646, %f3644, 0f37000000; selp.f32 %f540, %f3646, %f3644, %p553; selp.f32 %f3647, %f3645, %f3641, %p553; mul.rn.f32 %f3649, %f3647, %f5350; 
cvt.rzi.f32.f32 %f3650, %f3649; abs.f32 %f3651, %f3650; setp.gt.f32 %p554, %f3651, 0f42FC0000; mov.b32 %r1236, %f3650; and.b32 %r1237, %r1236, -2147483648; or.b32 %r1238, %r1237, 1123811328; mov.b32 %f3652, %r1238; selp.f32 %f3653, %f3652, %f3650, %p554; fma.rn.f32 %f3655, %f3653, %f5351, %f3647; fma.rn.f32 %f3657, %f3653, %f5352, %f3655; mul.f32 %f3658, %f3657, 0f3FB8AA3B; add.f32 %f3659, %f3653, 0f4B40007F; mov.b32 %r1239, %f3659; shl.b32 %r1240, %r1239, 23; mov.b32 %f3660, %r1240; ex2.approx.ftz.f32 %f3661, %f3658; mul.f32 %f541, %f3661, %f3660; setp.eq.f32 %p555, %f541, 0f7F800000; mov.f32 %f5555, 0f7F800000; @%p555 bra $L__BB0_410; fma.rn.f32 %f5555, %f541, %f540, %f541; $L__BB0_410: mov.f32 %f5321, 0fBF000000; cvt.rzi.f32.f32 %f5320, %f5321; add.f32 %f5319, %f5320, %f5320; mov.f32 %f5318, 0fBF800000; sub.f32 %f5317, %f5318, %f5319; abs.f32 %f5316, %f5317; setp.lt.f32 %p556, %f532, 0f00000000; setp.eq.f32 %p557, %f5316, 0f3F800000; and.pred %p13, %p556, %p557; setp.eq.f32 %p558, %f532, 0f00000000; @%p558 bra $L__BB0_414; bra.uni $L__BB0_411; $L__BB0_414: add.f32 %f3666, %f532, %f532; mov.b32 %r1243, %f3666; or.b32 %r1244, %r1243, 2139095040; mov.b32 %f3667, %r1244; selp.f32 %f5557, %f3667, 0f7F800000, %p557; bra.uni $L__BB0_415; $L__BB0_423: add.u64 %rd1294, %SPL, 80; ld.global.u64 %rd876, [%rd67+24]; mul.wide.u32 %rd877, %r8, 16; add.s64 %rd878, %rd876, %rd877; ld.f32 %f577, [%rd878+8]; mul.f32 %f3696, %f5546, %f5635; sub.f32 %f578, %f3696, %f438; ld.local.v4.f32 {%f5546, %f3698, %f3699, %f3700}, [%rd1294]; add.f32 %f3702, %f3700, %f5546; mul.f32 %f580, %f3702, 0f3F000000; sub.f32 %f3703, %f5546, %f3700; mul.f32 %f3704, %f3703, 0f3F000000; add.f32 %f3707, %f3698, %f3699; mul.f32 %f3708, %f3707, 0f3F000000; sub.f32 %f3709, %f3698, %f3699; mul.f32 %f581, %f3709, 0f3F000000; mul.f32 %f3710, %f581, %f581; fma.rn.f32 %f3711, %f580, %f580, %f3710; sqrt.rn.f32 %f3712, %f3711; mul.f32 %f3713, %f3708, %f3708; fma.rn.f32 %f3714, %f3704, %f3704, %f3713; sqrt.rn.f32 
%f3715, %f3714; add.f32 %f582, %f3712, %f3715; sub.f32 %f583, %f3712, %f3715; abs.f32 %f584, %f3704; abs.f32 %f585, %f3708; setp.eq.f32 %p567, %f584, 0f00000000; setp.eq.f32 %p568, %f585, 0f00000000; and.pred %p569, %p567, %p568; mov.b32 %r219, %f3704; mov.b32 %r1251, %f3708; and.b32 %r220, %r1251, -2147483648; @%p569 bra $L__BB0_427; bra.uni $L__BB0_424; $L__BB0_427: shr.s32 %r1256, %r219, 31; and.b32 %r1257, %r1256, 1078530011; or.b32 %r1258, %r1257, %r220; mov.b32 %f5566, %r1258; bra.uni $L__BB0_428; $L__BB0_424: setp.eq.f32 %p570, %f584, 0f7F800000; setp.eq.f32 %p571, %f585, 0f7F800000; and.pred %p572, %p570, %p571; @%p572 bra $L__BB0_426; bra.uni $L__BB0_425; $L__BB0_426: setp.lt.s32 %p576, %r219, 0; selp.b32 %r1254, 1075235812, 1061752795, %p576; or.b32 %r1255, %r1254, %r220; mov.b32 %f5566, %r1255; bra.uni $L__BB0_428; $L__BB0_202: mov.b32 %r103, %f231; bfe.u32 %r750, %r103, 23, 8; add.s32 %r104, %r750, -128; shl.b32 %r751, %r103, 8; or.b32 %r105, %r751, -2147483648; shr.u32 %r106, %r104, 5; add.u64 %rd717, %SP, 32; add.u64 %rd1340, %SPL, 32; mov.u32 %r1743, 0; mov.u64 %rd1339, __cudart_i2opi_f; mov.u32 %r1744, %r1743; $L__BB0_203: .pragma "nounroll"; mov.u32 %r108, %r1744; ld.global.nc.u32 %r754, [%rd1339]; // begin inline asm { mad.lo.cc.u32 %r752, %r754, %r105, %r108; madc.hi.u32 %r1744, %r754, %r105, 0; } // end inline asm st.local.u32 [%rd1340], %r752; add.s64 %rd1340, %rd1340, 4; add.s64 %rd1339, %rd1339, 4; add.s32 %r1743, %r1743, 1; setp.ne.s32 %p289, %r1743, 6; @%p289 bra $L__BB0_203; mov.u32 %r759, -1560706194; // begin inline asm { mad.lo.cc.u32 %r757, %r759, %r105, %r108; madc.hi.u32 %r758, %r759, %r105, 0; } // end inline asm st.local.u32 [%rd94], %r758; mov.u32 %r762, 4; sub.s32 %r111, %r762, %r106; mov.u32 %r763, 6; sub.s32 %r764, %r763, %r106; cvta.to.local.u64 %rd719, %rd717; mul.wide.s32 %rd720, %r764, 4; add.s64 %rd721, %rd719, %rd720; ld.local.u32 %r1745, [%rd721]; ld.local.u32 %r1746, [%rd721+-4]; and.b32 %r114, %r104, 31; setp.eq.s32 
%p290, %r114, 0; @%p290 bra $L__BB0_206; mov.u32 %r765, 32; sub.s32 %r766, %r765, %r114; shr.u32 %r767, %r1746, %r766; shl.b32 %r768, %r1745, %r114; add.s32 %r1745, %r767, %r768; mul.wide.s32 %rd724, %r111, 4; add.s64 %rd725, %rd719, %rd724; ld.local.u32 %r769, [%rd725]; shr.u32 %r770, %r769, %r766; shl.b32 %r771, %r1746, %r114; add.s32 %r1746, %r770, %r771; $L__BB0_206: and.b32 %r772, %r103, -2147483648; shr.u32 %r773, %r1746, 30; shl.b32 %r774, %r1745, 2; or.b32 %r775, %r773, %r774; shr.u32 %r776, %r775, 31; shr.u32 %r777, %r1745, 30; add.s32 %r778, %r776, %r777; neg.s32 %r779, %r778; setp.eq.s32 %p291, %r772, 0; selp.b32 %r1747, %r778, %r779, %p291; setp.ne.s32 %p292, %r776, 0; xor.b32 %r780, %r772, -2147483648; selp.b32 %r781, %r780, %r772, %p292; selp.b32 %r782, -1, 0, %p292; xor.b32 %r783, %r775, %r782; shl.b32 %r784, %r1746, 2; xor.b32 %r785, %r784, %r782; cvt.u64.u32 %rd726, %r783; cvt.u64.u32 %rd727, %r785; bfi.b64 %rd728, %rd726, %rd727, 32, 32; cvt.rn.f64.s64 %fd9, %rd728; mul.f64 %fd10, %fd9, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f2222, %fd10; setp.eq.s32 %p293, %r781, 0; neg.f32 %f2223, %f2222; selp.f32 %f5496, %f2222, %f2223, %p293; $L__BB0_208: mul.f32 %f2225, %f232, 0f3F22F983; cvt.rni.s32.f32 %r1752, %f2225; cvt.rn.f32.s32 %f2226, %r1752; fma.rn.f32 %f2228, %f2226, %f2217, %f232; fma.rn.f32 %f2230, %f2226, %f2219, %f2228; fma.rn.f32 %f5497, %f2226, %f2221, %f2230; abs.f32 %f239, %f232; setp.leu.f32 %p294, %f239, 0f47CE4780; @%p294 bra $L__BB0_216; setp.eq.f32 %p295, %f239, 0f7F800000; @%p295 bra $L__BB0_215; bra.uni $L__BB0_210; $L__BB0_215: mul.rn.f32 %f5497, %f232, %f1058; bra.uni $L__BB0_216; $L__BB0_210: mov.b32 %r122, %f232; bfe.u32 %r788, %r122, 23, 8; add.s32 %r123, %r788, -128; shl.b32 %r789, %r122, 8; or.b32 %r124, %r789, -2147483648; shr.u32 %r125, %r123, 5; add.u64 %rd730, %SP, 32; add.u64 %rd1342, %SPL, 32; mov.u32 %r1748, 0; mov.u64 %rd1341, __cudart_i2opi_f; mov.u32 %r1749, %r1748; $L__BB0_211: .pragma "nounroll"; mov.u32 %r127, %r1749; 
ld.global.nc.u32 %r792, [%rd1341]; // begin inline asm { mad.lo.cc.u32 %r790, %r792, %r124, %r127; madc.hi.u32 %r1749, %r792, %r124, 0; } // end inline asm st.local.u32 [%rd1342], %r790; add.s64 %rd1342, %rd1342, 4; add.s64 %rd1341, %rd1341, 4; add.s32 %r1748, %r1748, 1; setp.ne.s32 %p296, %r1748, 6; @%p296 bra $L__BB0_211; mov.u32 %r797, -1560706194; // begin inline asm { mad.lo.cc.u32 %r795, %r797, %r124, %r127; madc.hi.u32 %r796, %r797, %r124, 0; } // end inline asm st.local.u32 [%rd94], %r796; mov.u32 %r800, 4; sub.s32 %r130, %r800, %r125; mov.u32 %r801, 6; sub.s32 %r802, %r801, %r125; cvta.to.local.u64 %rd732, %rd730; mul.wide.s32 %rd733, %r802, 4; add.s64 %rd734, %rd732, %rd733; ld.local.u32 %r1750, [%rd734]; ld.local.u32 %r1751, [%rd734+-4]; and.b32 %r133, %r123, 31; setp.eq.s32 %p297, %r133, 0; @%p297 bra $L__BB0_214; mov.u32 %r803, 32; sub.s32 %r804, %r803, %r133; shr.u32 %r805, %r1751, %r804; shl.b32 %r806, %r1750, %r133; add.s32 %r1750, %r805, %r806; mul.wide.s32 %rd737, %r130, 4; add.s64 %rd738, %rd732, %rd737; ld.local.u32 %r807, [%rd738]; shr.u32 %r808, %r807, %r804; shl.b32 %r809, %r1751, %r133; add.s32 %r1751, %r808, %r809; $L__BB0_214: and.b32 %r810, %r122, -2147483648; shr.u32 %r811, %r1751, 30; shl.b32 %r812, %r1750, 2; or.b32 %r813, %r811, %r812; shr.u32 %r814, %r813, 31; shr.u32 %r815, %r1750, 30; add.s32 %r816, %r814, %r815; neg.s32 %r817, %r816; setp.eq.s32 %p298, %r810, 0; selp.b32 %r1752, %r816, %r817, %p298; setp.ne.s32 %p299, %r814, 0; xor.b32 %r818, %r810, -2147483648; selp.b32 %r819, %r818, %r810, %p299; selp.b32 %r820, -1, 0, %p299; xor.b32 %r821, %r813, %r820; shl.b32 %r822, %r1751, 2; xor.b32 %r823, %r822, %r820; cvt.u64.u32 %rd739, %r821; cvt.u64.u32 %rd740, %r823; bfi.b64 %rd741, %rd739, %rd740, 32, 32; cvt.rn.f64.s64 %fd11, %rd741; mul.f64 %fd12, %fd11, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f2232, %fd12; setp.eq.s32 %p300, %r819, 0; neg.f32 %f2233, %f2232; selp.f32 %f5497, %f2232, %f2233, %p300; $L__BB0_216: mov.f32 %f5195, 
0f3F800000; mul.f32 %f2235, %f5496, %f5496; mov.f32 %f2236, 0fBAB607ED; mov.f32 %f2237, 0f37CBAC00; fma.rn.f32 %f2238, %f2237, %f2235, %f2236; mov.f32 %f2239, 0f3D2AAABB; fma.rn.f32 %f2240, %f2238, %f2235, %f2239; mov.f32 %f2241, 0fBEFFFFFF; fma.rn.f32 %f2242, %f2240, %f2235, %f2241; fma.rn.f32 %f2244, %f2242, %f2235, %f5195; mov.f32 %f2245, 0f3C0885E4; mov.f32 %f2246, 0fB94D4153; fma.rn.f32 %f2247, %f2246, %f2235, %f2245; mov.f32 %f2248, 0fBE2AAAA8; fma.rn.f32 %f2249, %f2247, %f2235, %f2248; fma.rn.f32 %f2251, %f2235, %f5496, %f1058; fma.rn.f32 %f2252, %f2249, %f2251, %f5496; and.b32 %r824, %r1747, 1; setp.eq.b32 %p301, %r824, 1; selp.f32 %f2253, %f2244, %f2252, %p301; selp.f32 %f2254, %f2252, %f2244, %p301; neg.f32 %f2255, %f2253; and.b32 %r825, %r1747, 2; setp.eq.s32 %p302, %r825, 0; selp.f32 %f2256, %f2253, %f2255, %p302; neg.f32 %f2257, %f2254; add.s32 %r826, %r1747, 1; and.b32 %r827, %r826, 2; setp.eq.s32 %p303, %r827, 0; selp.f32 %f2258, %f2254, %f2257, %p303; mul.f32 %f2259, %f5497, %f5497; fma.rn.f32 %f2260, %f2237, %f2259, %f2236; fma.rn.f32 %f2261, %f2260, %f2259, %f2239; fma.rn.f32 %f2262, %f2261, %f2259, %f2241; fma.rn.f32 %f2263, %f2262, %f2259, %f5195; fma.rn.f32 %f2264, %f2259, %f5497, %f1058; fma.rn.f32 %f2265, %f2246, %f2259, %f2245; fma.rn.f32 %f2266, %f2265, %f2259, %f2248; fma.rn.f32 %f2267, %f2266, %f2264, %f5497; and.b32 %r828, %r1752, 1; setp.eq.b32 %p304, %r828, 1; selp.f32 %f2268, %f2263, %f2267, %p304; selp.f32 %f2269, %f2267, %f2263, %p304; and.b32 %r829, %r1752, 2; setp.eq.s32 %p305, %r829, 0; neg.f32 %f2270, %f2268; selp.f32 %f2271, %f2268, %f2270, %p305; add.s32 %r830, %r1752, 1; and.b32 %r831, %r830, 2; setp.eq.s32 %p306, %r831, 0; neg.f32 %f2272, %f2269; selp.f32 %f2273, %f2269, %f2272, %p306; mov.b32 %r832, %f2273; neg.f32 %f2274, %f2271; mov.b32 %r833, %f2271; cvt.u64.u32 %rd742, %r833; cvt.u64.u32 %rd743, %r832; bfi.b64 %rd144, %rd742, %rd743, 32, 32; mov.b32 %r834, %f2274; cvt.u64.u32 %rd744, %r834; bfi.b64 %rd145, %rd743, 
%rd744, 32, 32; mul.f32 %f2275, %f217, %f2256; mov.b32 %r835, %f2275; cvt.u64.u32 %rd745, %r835; mov.b32 %r836, %f2258; cvt.u64.u32 %rd746, %r836; bfi.b64 %rd146, %rd745, %rd746, 32, 32; neg.f32 %f2276, %f2256; mov.b32 %r837, %f2276; mul.f32 %f2277, %f217, %f2258; mov.b32 %r838, %f2277; cvt.u64.u32 %rd747, %r838; cvt.u64.u32 %rd748, %r837; bfi.b64 %rd147, %rd747, %rd748, 32, 32; mul.f32 %f243, %f216, %f216; add.f32 %f2278, %f243, 0f00000000; mul.f32 %f244, %f218, %f218; add.f32 %f245, %f2278, %f244; ld.global.f32 %f246, [%rd67+44]; neg.f32 %f2279, %f5517; max.f32 %f2280, %f2279, %f1058; mul.f32 %f247, %f211, %f2280; abs.f32 %f248, %f247; setp.ltu.f32 %p307, %f248, 0f3F800000; @%p307 bra $L__BB0_218; bra.uni $L__BB0_217; $L__BB0_218: mul.f32 %f2302, %f247, %f247; mov.f32 %f2303, 0f394FFF49; mov.f32 %f2304, 0f363D0ADA; fma.rn.f32 %f2305, %f2304, %f2302, %f2303; mov.f32 %f2306, 0f3C08889A; fma.rn.f32 %f2307, %f2305, %f2302, %f2306; mov.f32 %f2308, 0f3E2AAAAB; fma.rn.f32 %f2309, %f2307, %f2302, %f2308; mul.f32 %f2310, %f2302, %f2309; fma.rn.f32 %f5498, %f2310, %f247, %f247; bra.uni $L__BB0_219; $L__BB0_217: mov.f32 %f5399, 0f3102E308; mov.f32 %f5398, 0fBF317218; mov.f32 %f5397, 0f3FB8AA3B; mul.rn.f32 %f2282, %f248, %f5397; cvt.rzi.f32.f32 %f2283, %f2282; abs.f32 %f2284, %f2283; setp.gt.f32 %p308, %f2284, 0f42FC0000; mov.b32 %r839, %f2283; and.b32 %r840, %r839, -2147483648; or.b32 %r841, %r840, 1123811328; mov.b32 %f2285, %r841; selp.f32 %f2286, %f2285, %f2283, %p308; fma.rn.f32 %f2288, %f2286, %f5398, %f248; fma.rn.f32 %f2290, %f2286, %f5399, %f2288; mul.f32 %f2291, %f2290, 0f3FB8AA3B; add.f32 %f2292, %f2286, 0f4B40007D; mov.b32 %r842, %f2292; shl.b32 %r843, %r842, 23; mov.b32 %f2293, %r843; ex2.approx.ftz.f32 %f2294, %f2291; mul.f32 %f2295, %f2294, %f2293; mov.f32 %f2296, 0f3E000000; div.approx.f32 %f2297, %f2296, %f2295; neg.f32 %f2298, %f2297; fma.rn.f32 %f2300, %f1007, %f2295, %f2298; setp.ge.f32 %p309, %f248, 0f42B40000; selp.f32 %f2301, 0f7F800000, %f2300, %p309; 
mov.b32 %r844, %f2301; mov.b32 %r845, %f247; and.b32 %r846, %r845, -2147483648; or.b32 %r847, %r846, %r844; mov.b32 %f5498, %r847; $L__BB0_219: mov.f32 %f5407, 0f3102E308; mov.f32 %f5406, 0fBF317218; mov.f32 %f5405, 0f3FB8AA3B; mov.f32 %f5404, 0f35BFBE8E; mov.f32 %f5403, 0f3F317200; mov.f32 %f5402, 0f3DAAAABD; mov.f32 %f5401, 0f3C4CAF63; mov.f32 %f5400, 0f3B18F0FE; add.f32 %f2314, %f5498, 0f3727C5AC; mul.f32 %f252, %f246, %f2314; ld.global.f32 %f253, [%rd67+40]; mov.f32 %f2318, 0fBF800000; mul.f32 %f255, %f216, %f218; abs.f32 %f256, %f255; setp.lt.f32 %p310, %f256, 0f00800000; mul.f32 %f2320, %f256, 0f4B800000; selp.f32 %f2321, %f2320, %f256, %p310; selp.f32 %f2322, 0fC3170000, 0fC2FE0000, %p310; mov.b32 %r848, %f2321; and.b32 %r849, %r848, 8388607; or.b32 %r850, %r849, 1065353216; mov.b32 %f2323, %r850; shr.u32 %r851, %r848, 23; cvt.rn.f32.u32 %f2324, %r851; add.f32 %f2325, %f2322, %f2324; setp.gt.f32 %p311, %f2323, 0f3FB504F3; mul.f32 %f2326, %f2323, 0f3F000000; add.f32 %f2327, %f2325, 0f3F800000; selp.f32 %f2328, %f2327, %f2325, %p311; selp.f32 %f2329, %f2326, %f2323, %p311; add.f32 %f2330, %f2329, 0fBF800000; add.f32 %f2312, %f2329, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f2311,%f2312; // end inline asm add.f32 %f2331, %f2330, %f2330; mul.f32 %f2332, %f2311, %f2331; mul.f32 %f2333, %f2332, %f2332; fma.rn.f32 %f2336, %f5400, %f2333, %f5401; fma.rn.f32 %f2338, %f2336, %f2333, %f5402; mul.rn.f32 %f2339, %f2338, %f2333; mul.rn.f32 %f2340, %f2339, %f2332; sub.f32 %f2341, %f2330, %f2332; add.f32 %f2342, %f2341, %f2341; neg.f32 %f2343, %f2332; fma.rn.f32 %f2344, %f2343, %f2330, %f2342; mul.rn.f32 %f2345, %f2311, %f2344; add.f32 %f2346, %f2340, %f2332; sub.f32 %f2347, %f2332, %f2346; add.f32 %f2348, %f2340, %f2347; add.f32 %f2349, %f2345, %f2348; add.f32 %f2350, %f2346, %f2349; sub.f32 %f2351, %f2346, %f2350; add.f32 %f2352, %f2349, %f2351; mul.rn.f32 %f2354, %f2328, %f5403; mul.rn.f32 %f2356, %f2328, %f5404; add.f32 %f2357, %f2354, %f2350; sub.f32 %f2358, 
%f2354, %f2357; add.f32 %f2359, %f2350, %f2358; add.f32 %f2360, %f2352, %f2359; add.f32 %f2361, %f2356, %f2360; add.f32 %f257, %f2357, %f2361; sub.f32 %f2362, %f2357, %f257; add.f32 %f258, %f2361, %f2362; mul.rn.f32 %f2363, %f2318, %f257; neg.f32 %f2364, %f2363; fma.rn.f32 %f2365, %f2318, %f257, %f2364; fma.rn.f32 %f2366, %f2318, %f258, %f2365; fma.rn.f32 %f2368, %f1058, %f257, %f2366; add.rn.f32 %f2369, %f2363, %f2368; neg.f32 %f2370, %f2369; add.rn.f32 %f2371, %f2363, %f2370; add.rn.f32 %f2372, %f2371, %f2368; mov.b32 %r852, %f2369; setp.eq.s32 %p312, %r852, 1118925336; add.s32 %r853, %r852, -1; mov.b32 %f2373, %r853; add.f32 %f2374, %f2372, 0f37000000; selp.f32 %f259, %f2374, %f2372, %p312; selp.f32 %f2375, %f2373, %f2369, %p312; mul.rn.f32 %f2377, %f2375, %f5405; cvt.rzi.f32.f32 %f2378, %f2377; abs.f32 %f2379, %f2378; setp.gt.f32 %p313, %f2379, 0f42FC0000; mov.b32 %r854, %f2378; and.b32 %r855, %r854, -2147483648; or.b32 %r856, %r855, 1123811328; mov.b32 %f2380, %r856; selp.f32 %f2381, %f2380, %f2378, %p313; fma.rn.f32 %f2383, %f2381, %f5406, %f2375; fma.rn.f32 %f2385, %f2381, %f5407, %f2383; mul.f32 %f2386, %f2385, 0f3FB8AA3B; add.f32 %f2387, %f2381, 0f4B40007F; mov.b32 %r857, %f2387; shl.b32 %r858, %r857, 23; mov.b32 %f2388, %r858; ex2.approx.ftz.f32 %f2389, %f2386; mul.f32 %f260, %f2389, %f2388; setp.eq.f32 %p314, %f260, 0f7F800000; mov.f32 %f5499, 0f7F800000; @%p314 bra $L__BB0_221; fma.rn.f32 %f5499, %f260, %f259, %f260; $L__BB0_221: mov.f32 %f5206, 0fBF000000; cvt.rzi.f32.f32 %f5205, %f5206; add.f32 %f5204, %f5205, %f5205; mov.f32 %f5203, 0fBF800000; sub.f32 %f5202, %f5203, %f5204; abs.f32 %f5201, %f5202; setp.lt.f32 %p315, %f255, 0f00000000; setp.eq.f32 %p316, %f5201, 0f3F800000; and.pred %p7, %p315, %p316; setp.eq.f32 %p317, %f255, 0f00000000; @%p317 bra $L__BB0_225; bra.uni $L__BB0_222; $L__BB0_225: add.f32 %f2394, %f255, %f255; mov.b32 %r861, %f2394; or.b32 %r862, %r861, 2139095040; mov.b32 %f2395, %r862; selp.f32 %f5501, %f2395, 0f7F800000, %p316; 
bra.uni $L__BB0_226; $L__BB0_222: mov.b32 %r859, %f5499; xor.b32 %r860, %r859, -2147483648; mov.b32 %f2390, %r860; selp.f32 %f5501, %f2390, %f5499, %p7; setp.geu.f32 %p318, %f255, 0f00000000; @%p318 bra $L__BB0_226; mov.f32 %f5181, 0fBF800000; cvt.rzi.f32.f32 %f2392, %f5181; setp.eq.f32 %p319, %f2392, 0fBF800000; @%p319 bra $L__BB0_226; mov.f32 %f5501, 0f7FFFFFFF; $L__BB0_226: abs.f32 %f5207, %f255; add.f32 %f2396, %f5207, 0f3F800000; mov.b32 %r140, %f2396; setp.lt.s32 %p321, %r140, 2139095040; @%p321 bra $L__BB0_231; abs.f32 %f5225, %f255; setp.gtu.f32 %p322, %f5225, 0f7F800000; @%p322 bra $L__BB0_230; bra.uni $L__BB0_228; $L__BB0_230: add.f32 %f5501, %f255, 0fBF800000; bra.uni $L__BB0_231; $L__BB0_228: abs.f32 %f5226, %f255; setp.neu.f32 %p323, %f5226, 0f7F800000; @%p323 bra $L__BB0_231; selp.f32 %f5501, 0f80000000, 0f00000000, %p7; $L__BB0_231: setp.eq.f32 %p324, %f255, 0f3F800000; selp.f32 %f2397, 0f3F800000, %f5501, %p324; mul.f32 %f2398, %f253, %f2397; mul.f32 %f269, %f245, 0f3F000000; sub.f32 %f2399, %f243, %f269; sub.f32 %f2400, %f244, %f269; mul.f32 %f270, %f2399, %f2398; mul.f32 %f271, %f2400, %f2398; rcp.rn.f32 %f2401, %f255; sub.f32 %f2402, %f255, %f2401; mul.f32 %f2403, %f246, 0f3F000000; mul.f32 %f2404, %f2402, %f2403; mul.f32 %f272, %f255, %f2404; neg.f32 %f273, %f272; setp.lt.f32 %p325, %f252, %f273; @%p325 bra $L__BB0_266; bra.uni $L__BB0_232; $L__BB0_266: mov.f32 %f5376, 0f3102E308; mov.f32 %f5375, 0fBF317218; mov.f32 %f5374, 0f3FB8AA3B; mov.f32 %f5373, 0f35BFBE8E; mov.f32 %f5372, 0f3F317200; mov.f32 %f5371, 0f3DAAAABD; mov.f32 %f5370, 0f3C4CAF63; mov.f32 %f5369, 0f3B18F0FE; mov.f32 %f5223, 0f00000000; mul.f32 %f2692, %f252, 0fC0000000; div.rn.f32 %f2693, %f2692, %f246; add.f32 %f2694, %f2693, 0f3F800000; sqrt.rn.f32 %f326, %f2694; mov.f32 %f2695, 0f3E800000; cvt.rzi.f32.f32 %f2696, %f2695; add.f32 %f2697, %f2696, %f2696; mov.f32 %f2698, 0f3F000000; sub.f32 %f2699, %f2698, %f2697; abs.f32 %f327, %f2699; abs.f32 %f328, %f326; setp.lt.f32 %p371, 
%f328, 0f00800000; mul.f32 %f2700, %f328, 0f4B800000; selp.f32 %f2701, %f2700, %f328, %p371; selp.f32 %f2702, 0fC3170000, 0fC2FE0000, %p371; mov.b32 %r918, %f2701; and.b32 %r919, %r918, 8388607; or.b32 %r920, %r919, 1065353216; mov.b32 %f2703, %r920; shr.u32 %r921, %r918, 23; cvt.rn.f32.u32 %f2704, %r921; add.f32 %f2705, %f2702, %f2704; setp.gt.f32 %p372, %f2703, 0f3FB504F3; mul.f32 %f2706, %f2703, 0f3F000000; add.f32 %f2707, %f2705, 0f3F800000; selp.f32 %f2708, %f2707, %f2705, %p372; selp.f32 %f2709, %f2706, %f2703, %p372; add.f32 %f2710, %f2709, 0fBF800000; add.f32 %f2690, %f2709, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f2689,%f2690; // end inline asm add.f32 %f2711, %f2710, %f2710; mul.f32 %f2712, %f2689, %f2711; mul.f32 %f2713, %f2712, %f2712; fma.rn.f32 %f2716, %f5369, %f2713, %f5370; fma.rn.f32 %f2718, %f2716, %f2713, %f5371; mul.rn.f32 %f2719, %f2718, %f2713; mul.rn.f32 %f2720, %f2719, %f2712; sub.f32 %f2721, %f2710, %f2712; add.f32 %f2722, %f2721, %f2721; neg.f32 %f2723, %f2712; fma.rn.f32 %f2724, %f2723, %f2710, %f2722; mul.rn.f32 %f2725, %f2689, %f2724; add.f32 %f2726, %f2720, %f2712; sub.f32 %f2727, %f2712, %f2726; add.f32 %f2728, %f2720, %f2727; add.f32 %f2729, %f2725, %f2728; add.f32 %f2730, %f2726, %f2729; sub.f32 %f2731, %f2726, %f2730; add.f32 %f2732, %f2729, %f2731; mul.rn.f32 %f2734, %f2708, %f5372; mul.rn.f32 %f2736, %f2708, %f5373; add.f32 %f2737, %f2734, %f2730; sub.f32 %f2738, %f2734, %f2737; add.f32 %f2739, %f2730, %f2738; add.f32 %f2740, %f2732, %f2739; add.f32 %f2741, %f2736, %f2740; add.f32 %f2742, %f2737, %f2741; sub.f32 %f2743, %f2737, %f2742; add.f32 %f2744, %f2741, %f2743; mul.rn.f32 %f2745, %f2698, %f2742; neg.f32 %f2746, %f2745; fma.rn.f32 %f2747, %f2698, %f2742, %f2746; fma.rn.f32 %f2748, %f2698, %f2744, %f2747; fma.rn.f32 %f2750, %f5223, %f2742, %f2748; add.rn.f32 %f2751, %f2745, %f2750; neg.f32 %f2752, %f2751; add.rn.f32 %f2753, %f2745, %f2752; add.rn.f32 %f2754, %f2753, %f2750; mov.b32 %r922, %f2751; setp.eq.s32 
%p373, %r922, 1118925336; add.s32 %r923, %r922, -1; mov.b32 %f2755, %r923; add.f32 %f2756, %f2754, 0f37000000; selp.f32 %f329, %f2756, %f2754, %p373; selp.f32 %f2757, %f2755, %f2751, %p373; mul.rn.f32 %f2759, %f2757, %f5374; cvt.rzi.f32.f32 %f2760, %f2759; abs.f32 %f2761, %f2760; setp.gt.f32 %p374, %f2761, 0f42FC0000; mov.b32 %r924, %f2760; and.b32 %r925, %r924, -2147483648; or.b32 %r926, %r925, 1123811328; mov.b32 %f2762, %r926; selp.f32 %f2763, %f2762, %f2760, %p374; fma.rn.f32 %f2765, %f2763, %f5375, %f2757; fma.rn.f32 %f2767, %f2763, %f5376, %f2765; mul.f32 %f2768, %f2767, 0f3FB8AA3B; add.f32 %f2769, %f2763, 0f4B40007F; mov.b32 %r927, %f2769; shl.b32 %r928, %r927, 23; mov.b32 %f2770, %r928; ex2.approx.ftz.f32 %f2771, %f2768; mul.f32 %f330, %f2771, %f2770; setp.eq.f32 %p375, %f330, 0f7F800000; mov.f32 %f5512, 0f7F800000; @%p375 bra $L__BB0_268; fma.rn.f32 %f5512, %f330, %f329, %f330; $L__BB0_268: setp.lt.f32 %p376, %f326, 0f00000000; setp.eq.f32 %p377, %f327, 0f3F800000; and.pred %p10, %p376, %p377; setp.eq.f32 %p378, %f326, 0f00000000; @%p378 bra $L__BB0_272; bra.uni $L__BB0_269; $L__BB0_272: add.f32 %f2776, %f326, %f326; selp.f32 %f5514, %f2776, 0f00000000, %p377; bra.uni $L__BB0_273; $L__BB0_232: mul.f32 %f274, %f212, %f252; setp.gt.f32 %p326, %f272, %f274; add.f32 %f275, %f212, %f212; @%p326 bra $L__BB0_249; bra.uni $L__BB0_233; $L__BB0_249: mov.f32 %f5368, 0f3102E308; mov.f32 %f5367, 0fBF317218; mov.f32 %f5366, 0f3FB8AA3B; mov.f32 %f5365, 0f35BFBE8E; mov.f32 %f5364, 0f3F317200; mov.f32 %f5363, 0f3DAAAABD; mov.f32 %f5362, 0f3C4CAF63; mov.f32 %f5361, 0f3B18F0FE; mov.f32 %f5221, 0f00000000; mul.f32 %f2552, %f275, %f252; div.rn.f32 %f2553, %f2552, %f246; add.f32 %f2554, %f2553, 0f3F800000; sqrt.rn.f32 %f307, %f2554; mov.f32 %f2558, 0f3F000000; abs.f32 %f309, %f307; setp.lt.f32 %p352, %f309, 0f00800000; mul.f32 %f2560, %f309, 0f4B800000; selp.f32 %f2561, %f2560, %f309, %p352; selp.f32 %f2562, 0fC3170000, 0fC2FE0000, %p352; mov.b32 %r888, %f2561; and.b32 %r889, 
%r888, 8388607; or.b32 %r890, %r889, 1065353216; mov.b32 %f2563, %r890; shr.u32 %r891, %r888, 23; cvt.rn.f32.u32 %f2564, %r891; add.f32 %f2565, %f2562, %f2564; setp.gt.f32 %p353, %f2563, 0f3FB504F3; mul.f32 %f2566, %f2563, 0f3F000000; add.f32 %f2567, %f2565, 0f3F800000; selp.f32 %f2568, %f2567, %f2565, %p353; selp.f32 %f2569, %f2566, %f2563, %p353; add.f32 %f2570, %f2569, 0fBF800000; add.f32 %f2550, %f2569, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f2549,%f2550; // end inline asm add.f32 %f2571, %f2570, %f2570; mul.f32 %f2572, %f2549, %f2571; mul.f32 %f2573, %f2572, %f2572; fma.rn.f32 %f2576, %f5361, %f2573, %f5362; fma.rn.f32 %f2578, %f2576, %f2573, %f5363; mul.rn.f32 %f2579, %f2578, %f2573; mul.rn.f32 %f2580, %f2579, %f2572; sub.f32 %f2581, %f2570, %f2572; add.f32 %f2582, %f2581, %f2581; neg.f32 %f2583, %f2572; fma.rn.f32 %f2584, %f2583, %f2570, %f2582; mul.rn.f32 %f2585, %f2549, %f2584; add.f32 %f2586, %f2580, %f2572; sub.f32 %f2587, %f2572, %f2586; add.f32 %f2588, %f2580, %f2587; add.f32 %f2589, %f2585, %f2588; add.f32 %f2590, %f2586, %f2589; sub.f32 %f2591, %f2586, %f2590; add.f32 %f2592, %f2589, %f2591; mul.rn.f32 %f2594, %f2568, %f5364; mul.rn.f32 %f2596, %f2568, %f5365; add.f32 %f2597, %f2594, %f2590; sub.f32 %f2598, %f2594, %f2597; add.f32 %f2599, %f2590, %f2598; add.f32 %f2600, %f2592, %f2599; add.f32 %f2601, %f2596, %f2600; add.f32 %f2602, %f2597, %f2601; sub.f32 %f2603, %f2597, %f2602; add.f32 %f2604, %f2601, %f2603; mul.rn.f32 %f2605, %f2558, %f2602; neg.f32 %f2606, %f2605; fma.rn.f32 %f2607, %f2558, %f2602, %f2606; fma.rn.f32 %f2608, %f2558, %f2604, %f2607; fma.rn.f32 %f2610, %f5221, %f2602, %f2608; add.rn.f32 %f2611, %f2605, %f2610; neg.f32 %f2612, %f2611; add.rn.f32 %f2613, %f2605, %f2612; add.rn.f32 %f2614, %f2613, %f2610; mov.b32 %r892, %f2611; setp.eq.s32 %p354, %r892, 1118925336; add.s32 %r893, %r892, -1; mov.b32 %f2615, %r893; add.f32 %f2616, %f2614, 0f37000000; selp.f32 %f310, %f2616, %f2614, %p354; selp.f32 %f2617, %f2615, %f2611, 
%p354; mul.rn.f32 %f2619, %f2617, %f5366; cvt.rzi.f32.f32 %f2620, %f2619; abs.f32 %f2621, %f2620; setp.gt.f32 %p355, %f2621, 0f42FC0000; mov.b32 %r894, %f2620; and.b32 %r895, %r894, -2147483648; or.b32 %r896, %r895, 1123811328; mov.b32 %f2622, %r896; selp.f32 %f2623, %f2622, %f2620, %p355; fma.rn.f32 %f2625, %f2623, %f5367, %f2617; fma.rn.f32 %f2627, %f2623, %f5368, %f2625; mul.f32 %f2628, %f2627, 0f3FB8AA3B; add.f32 %f2629, %f2623, 0f4B40007F; mov.b32 %r897, %f2629; shl.b32 %r898, %r897, 23; mov.b32 %f2630, %r898; ex2.approx.ftz.f32 %f2631, %f2628; mul.f32 %f311, %f2631, %f2630; setp.eq.f32 %p356, %f311, 0f7F800000; mov.f32 %f5507, 0f7F800000; @%p356 bra $L__BB0_251; fma.rn.f32 %f5507, %f311, %f310, %f311; $L__BB0_251: mov.f32 %f5232, 0f3E800000; cvt.rzi.f32.f32 %f5231, %f5232; add.f32 %f5230, %f5231, %f5231; mov.f32 %f5229, 0f3F000000; sub.f32 %f5228, %f5229, %f5230; abs.f32 %f5227, %f5228; setp.lt.f32 %p357, %f307, 0f00000000; setp.eq.f32 %p358, %f5227, 0f3F800000; and.pred %p9, %p357, %p358; setp.eq.f32 %p359, %f307, 0f00000000; @%p359 bra $L__BB0_255; bra.uni $L__BB0_252; $L__BB0_255: add.f32 %f2636, %f307, %f307; selp.f32 %f5509, %f2636, 0f00000000, %p358; bra.uni $L__BB0_256; $L__BB0_233: add.f32 %f276, %f275, 0f3F800000; add.f32 %f277, %f276, %f276; sub.f32 %f2405, %f274, %f272; mul.f32 %f278, %f213, %f213; mul.f32 %f2406, %f278, %f2405; sub.f32 %f2407, %f273, %f252; mul.f32 %f279, %f2407, %f2406; mul.f32 %f2408, %f271, %f271; fma.rn.f32 %f2409, %f270, %f270, %f2408; add.f32 %f280, %f2409, 0f00000000; fma.rn.f32 %f2410, %f277, %f280, %f279; setp.lt.f32 %p327, %f2410, 0f38D1B717; @%p327 bra $L__BB0_283; ld.global.u8 %rs22, [%rd67+48]; setp.eq.s16 %p328, %rs22, 0; setp.leu.f32 %p329, %f252, 0f38D1B717; mov.f32 %f2411, 0f38D1B717; or.pred %p330, %p329, %p328; add.f32 %f2412, %f252, 0fB8D1B717; setp.leu.f32 %p331, %f2412, %f273; or.pred %p332, %p331, %p330; sub.f32 %f2413, %f2411, %f274; setp.geu.f32 %p333, %f2413, %f273; sqrt.rn.f32 %f281, %f280; or.pred 
%p334, %p333, %p332; @%p334 bra $L__BB0_241; mov.f32 %f5197, 0f3F800000; sub.f32 %f2415, %f5197, %f212; mul.f32 %f2416, %f2415, %f252; mul.f32 %f282, %f2416, 0f3F000000; add.f32 %f2417, %f272, %f282; fma.rn.f32 %f2418, %f281, 0fBFB504F3, 0f00000000; mul.f32 %f2419, %f2418, %f2418; fma.rn.f32 %f2420, %f2417, %f2417, %f2419; add.f32 %f2421, %f2420, 0f00000000; sqrt.rn.f32 %f2422, %f2421; div.rn.f32 %f283, %f2417, %f2422; div.rn.f32 %f2423, %f2418, %f2422; add.f32 %f2424, %f274, %f282; mul.f32 %f2425, %f278, %f2424; sub.f32 %f2426, %f282, %f252; mul.f32 %f2427, %f2426, %f2425; mul.f32 %f2428, %f278, %f283; add.f32 %f2429, %f282, %f282; sub.f32 %f2430, %f2429, %f252; add.f32 %f2431, %f274, %f2430; mul.f32 %f284, %f2431, %f2428; mul.f32 %f2432, %f276, %f2423; mul.f32 %f2433, %f2423, %f2432; fma.rn.f32 %f2434, %f283, %f2428, %f2433; mul.f32 %f2435, %f2434, 0fC0800000; mul.f32 %f2436, %f2427, %f2435; fma.rn.f32 %f2437, %f284, %f284, %f2436; sqrt.rn.f32 %f285, %f2437; sub.f32 %f2438, %f285, %f284; add.f32 %f286, %f2434, %f2434; div.rn.f32 %f2439, %f2438, %f286; fma.rn.f32 %f5502, %f283, %f2439, %f282; sub.f32 %f2440, %f273, %f282; sub.f32 %f2441, %f5502, %f282; mul.f32 %f2442, %f2440, %f2441; setp.gt.f32 %p335, %f2442, 0f00000000; @%p335 bra $L__BB0_237; neg.f32 %f2443, %f284; sub.f32 %f2444, %f2443, %f285; div.rn.f32 %f2445, %f2444, %f286; fma.rn.f32 %f5502, %f283, %f2445, %f282; $L__BB0_237: mul.f32 %f2446, %f5502, 0fC0000000; div.rn.f32 %f2447, %f2446, %f246; add.f32 %f2448, %f2447, 0f3F800000; abs.f32 %f2449, %f2448; sqrt.rn.f32 %f290, %f2449; setp.leu.f32 %p336, %f290, 0f38D1B717; @%p336 bra $L__BB0_241; mov.f32 %f5211, 0fBF000000; div.rn.f32 %f2450, %f255, %f290; setp.lt.f32 %p337, %f2450, 0f00800000; mul.f32 %f2451, %f2450, 0f4B000000; selp.f32 %f291, %f2451, %f2450, %p337; selp.f32 %f2452, 0fC1B80000, 0f00000000, %p337; mov.b32 %r863, %f291; add.s32 %r864, %r863, -1059760811; and.b32 %r865, %r864, -8388608; sub.s32 %r866, %r863, %r865; mov.b32 %f2453, %r866; 
cvt.rn.f32.s32 %f2454, %r865; mov.f32 %f2455, 0f34000000; fma.rn.f32 %f2456, %f2454, %f2455, %f2452; add.f32 %f2457, %f2453, 0fBF800000; mov.f32 %f2458, 0f3E1039F6; mov.f32 %f2459, 0fBE055027; fma.rn.f32 %f2460, %f2459, %f2457, %f2458; mov.f32 %f2461, 0fBDF8CDCC; fma.rn.f32 %f2462, %f2460, %f2457, %f2461; mov.f32 %f2463, 0f3E0F2955; fma.rn.f32 %f2464, %f2462, %f2457, %f2463; mov.f32 %f2465, 0fBE2AD8B9; fma.rn.f32 %f2466, %f2464, %f2457, %f2465; mov.f32 %f2467, 0f3E4CED0B; fma.rn.f32 %f2468, %f2466, %f2457, %f2467; mov.f32 %f2469, 0fBE7FFF22; fma.rn.f32 %f2470, %f2468, %f2457, %f2469; mov.f32 %f2471, 0f3EAAAA78; fma.rn.f32 %f2472, %f2470, %f2457, %f2471; fma.rn.f32 %f2474, %f2472, %f2457, %f5211; mul.f32 %f2475, %f2457, %f2474; fma.rn.f32 %f2476, %f2475, %f2457, %f2457; mov.f32 %f2477, 0f3F317218; fma.rn.f32 %f5503, %f2456, %f2477, %f2476; setp.lt.u32 %p338, %r863, 2139095040; @%p338 bra $L__BB0_240; mov.f32 %f2478, 0f7F800000; fma.rn.f32 %f5503, %f291, %f2478, %f2478; $L__BB0_240: setp.eq.f32 %p339, %f291, 0f00000000; selp.f32 %f2479, 0fFF800000, %f5503, %p339; add.f32 %f5517, %f5517, %f2479; $L__BB0_241: mov.f32 %f5336, 0f3102E308; mov.f32 %f5335, 0fBF317218; mov.f32 %f5334, 0f3FB8AA3B; mov.f32 %f5208, 0f00000000; mov.f32 %f5196, 0f3F800000; neg.f32 %f2481, %f279; div.rn.f32 %f2482, %f2481, %f277; sqrt.rn.f32 %f297, %f2482; mov.f32 %f2483, 0f3F000000; cvt.rzi.f32.f32 %f2484, %f2483; add.f32 %f2485, %f2484, %f2484; sub.f32 %f2487, %f5196, %f2485; abs.f32 %f298, %f2487; mul.rn.f32 %f2488, %f5196, %f257; neg.f32 %f2489, %f2488; fma.rn.f32 %f2490, %f5196, %f257, %f2489; fma.rn.f32 %f2491, %f5196, %f258, %f2490; fma.rn.f32 %f2493, %f5208, %f257, %f2491; add.rn.f32 %f2494, %f2488, %f2493; neg.f32 %f2495, %f2494; add.rn.f32 %f2496, %f2488, %f2495; add.rn.f32 %f2497, %f2496, %f2493; mov.b32 %r867, %f2494; setp.eq.s32 %p340, %r867, 1118925336; add.s32 %r868, %r867, -1; mov.b32 %f2498, %r868; add.f32 %f2499, %f2497, 0f37000000; selp.f32 %f299, %f2499, %f2497, %p340; 
selp.f32 %f2500, %f2498, %f2494, %p340; mul.rn.f32 %f2502, %f2500, %f5334; cvt.rzi.f32.f32 %f2503, %f2502; abs.f32 %f2504, %f2503; setp.gt.f32 %p341, %f2504, 0f42FC0000; mov.b32 %r869, %f2503; and.b32 %r870, %r869, -2147483648; or.b32 %r871, %r870, 1123811328; mov.b32 %f2505, %r871; selp.f32 %f2506, %f2505, %f2503, %p341; fma.rn.f32 %f2508, %f2506, %f5335, %f2500; fma.rn.f32 %f2510, %f2506, %f5336, %f2508; mul.f32 %f2511, %f2510, 0f3FB8AA3B; add.f32 %f2512, %f2506, 0f4B40007F; mov.b32 %r872, %f2512; shl.b32 %r873, %r872, 23; mov.b32 %f2513, %r873; ex2.approx.ftz.f32 %f2514, %f2511; mul.f32 %f300, %f2514, %f2513; setp.eq.f32 %p342, %f300, 0f7F800000; mov.f32 %f5505, 0f7F800000; @%p342 bra $L__BB0_243; fma.rn.f32 %f5505, %f300, %f299, %f300; $L__BB0_243: mov.f32 %f5173, 0f3F800000; cvt.rzi.f32.f32 %f5172, %f5173; setp.eq.f32 %p344, %f298, 0f3F800000; and.pred %p8, %p315, %p344; mov.b32 %r874, %f5505; xor.b32 %r875, %r874, -2147483648; mov.b32 %f2515, %r875; selp.f32 %f2516, %f2515, %f5505, %p8; add.f32 %f2517, %f255, %f255; selp.f32 %f2518, %f2517, 0f00000000, %p344; setp.neu.f32 %p345, %f5172, 0f3F800000; and.pred %p346, %p315, %p345; selp.f32 %f2519, 0f7FFFFFFF, %f2516, %p346; selp.f32 %f5506, %f2518, %f2519, %p317; @%p321 bra $L__BB0_248; abs.f32 %f5209, %f255; setp.gtu.f32 %p349, %f5209, 0f7F800000; @%p349 bra $L__BB0_247; bra.uni $L__BB0_245; $L__BB0_247: add.f32 %f5506, %f255, 0f3F800000; bra.uni $L__BB0_248; $L__BB0_269: mov.b32 %r929, %f5512; xor.b32 %r930, %r929, -2147483648; mov.b32 %f2772, %r930; selp.f32 %f5514, %f2772, %f5512, %p10; setp.geu.f32 %p379, %f326, 0f00000000; @%p379 bra $L__BB0_273; cvt.rzi.f32.f32 %f2774, %f2698; setp.eq.f32 %p380, %f2774, 0f3F000000; @%p380 bra $L__BB0_273; mov.f32 %f5514, 0f7FFFFFFF; $L__BB0_273: add.f32 %f2777, %f328, 0f3F000000; mov.b32 %r931, %f2777; setp.lt.s32 %p382, %r931, 2139095040; @%p382 bra $L__BB0_278; setp.gtu.f32 %p383, %f328, 0f7F800000; @%p383 bra $L__BB0_277; bra.uni $L__BB0_275; $L__BB0_277: add.f32 
%f5514, %f326, 0f3F000000; bra.uni $L__BB0_278; $L__BB0_158: mov.b32 %r61, %f173; bfe.u32 %r614, %r61, 23, 8; add.s32 %r62, %r614, -128; shl.b32 %r615, %r61, 8; or.b32 %r63, %r615, -2147483648; shr.u32 %r64, %r62, 5; add.u64 %rd634, %SP, 32; add.u64 %rd1330, %SPL, 32; mov.u32 %r1733, 0; mov.u64 %rd1329, __cudart_i2opi_f; mov.u32 %r1734, %r1733; $L__BB0_159: .pragma "nounroll"; mov.u32 %r66, %r1734; ld.global.nc.u32 %r618, [%rd1329]; // begin inline asm { mad.lo.cc.u32 %r616, %r618, %r63, %r66; madc.hi.u32 %r1734, %r618, %r63, 0; } // end inline asm st.local.u32 [%rd1330], %r616; add.s64 %rd1330, %rd1330, 4; add.s64 %rd1329, %rd1329, 4; add.s32 %r1733, %r1733, 1; setp.ne.s32 %p226, %r1733, 6; @%p226 bra $L__BB0_159; mov.u32 %r623, -1560706194; // begin inline asm { mad.lo.cc.u32 %r621, %r623, %r63, %r66; madc.hi.u32 %r622, %r623, %r63, 0; } // end inline asm st.local.u32 [%rd94], %r622; mov.u32 %r626, 4; sub.s32 %r69, %r626, %r64; mov.u32 %r627, 6; sub.s32 %r628, %r627, %r64; cvta.to.local.u64 %rd636, %rd634; mul.wide.s32 %rd637, %r628, 4; add.s64 %rd638, %rd636, %rd637; ld.local.u32 %r1735, [%rd638]; ld.local.u32 %r1736, [%rd638+-4]; and.b32 %r72, %r62, 31; setp.eq.s32 %p227, %r72, 0; @%p227 bra $L__BB0_162; mov.u32 %r629, 32; sub.s32 %r630, %r629, %r72; shr.u32 %r631, %r1736, %r630; shl.b32 %r632, %r1735, %r72; add.s32 %r1735, %r631, %r632; mul.wide.s32 %rd641, %r69, 4; add.s64 %rd642, %rd636, %rd641; ld.local.u32 %r633, [%rd642]; shr.u32 %r634, %r633, %r630; shl.b32 %r635, %r1736, %r72; add.s32 %r1736, %r634, %r635; $L__BB0_162: and.b32 %r636, %r61, -2147483648; shr.u32 %r637, %r1736, 30; shl.b32 %r638, %r1735, 2; or.b32 %r639, %r637, %r638; shr.u32 %r640, %r639, 31; shr.u32 %r641, %r1735, 30; add.s32 %r642, %r640, %r641; neg.s32 %r643, %r642; setp.eq.s32 %p228, %r636, 0; selp.b32 %r1737, %r642, %r643, %p228; setp.ne.s32 %p229, %r640, 0; xor.b32 %r644, %r636, -2147483648; selp.b32 %r645, %r644, %r636, %p229; selp.b32 %r646, -1, 0, %p229; xor.b32 %r647, %r639, 
%r646; shl.b32 %r648, %r1736, 2; xor.b32 %r649, %r648, %r646; cvt.u64.u32 %rd643, %r647; cvt.u64.u32 %rd644, %r649; bfi.b64 %rd645, %rd643, %rd644, 32, 32; cvt.rn.f64.s64 %fd5, %rd645; mul.f64 %fd6, %fd5, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f1965, %fd6; setp.eq.s32 %p230, %r645, 0; neg.f32 %f1966, %f1965; selp.f32 %f5488, %f1965, %f1966, %p230; $L__BB0_164: mul.f32 %f1968, %f174, 0f3F22F983; cvt.rni.s32.f32 %r1742, %f1968; cvt.rn.f32.s32 %f1969, %r1742; fma.rn.f32 %f1971, %f1969, %f1960, %f174; fma.rn.f32 %f1973, %f1969, %f1962, %f1971; fma.rn.f32 %f5489, %f1969, %f1964, %f1973; abs.f32 %f181, %f174; setp.leu.f32 %p231, %f181, 0f47CE4780; @%p231 bra $L__BB0_172; setp.eq.f32 %p232, %f181, 0f7F800000; @%p232 bra $L__BB0_171; bra.uni $L__BB0_166; $L__BB0_171: mul.rn.f32 %f5489, %f174, %f1058; bra.uni $L__BB0_172; $L__BB0_131: mov.b32 %r19, %f140; bfe.u32 %r497, %r19, 23, 8; add.s32 %r20, %r497, -128; shl.b32 %r498, %r19, 8; or.b32 %r21, %r498, -2147483648; shr.u32 %r22, %r20, 5; cvta.to.local.u64 %rd1326, %rd1360; mov.u32 %r1723, 0; mov.u64 %rd1325, __cudart_i2opi_f; mov.u32 %r1724, %r1723; $L__BB0_132: .pragma "nounroll"; mov.u32 %r24, %r1724; ld.global.nc.u32 %r501, [%rd1325]; // begin inline asm { mad.lo.cc.u32 %r499, %r501, %r21, %r24; madc.hi.u32 %r1724, %r501, %r21, 0; } // end inline asm st.local.u32 [%rd1326], %r499; add.s64 %rd1326, %rd1326, 4; add.s64 %rd1325, %rd1325, 4; add.s32 %r1723, %r1723, 1; setp.ne.s32 %p185, %r1723, 6; @%p185 bra $L__BB0_132; mov.u32 %r506, -1560706194; // begin inline asm { mad.lo.cc.u32 %r504, %r506, %r21, %r24; madc.hi.u32 %r505, %r506, %r21, 0; } // end inline asm st.local.u32 [%rd94], %r505; mov.u32 %r509, 4; sub.s32 %r27, %r509, %r22; mov.u32 %r510, 6; sub.s32 %r511, %r510, %r22; cvta.to.local.u64 %rd592, %rd1360; mul.wide.s32 %rd593, %r511, 4; add.s64 %rd594, %rd592, %rd593; ld.local.u32 %r1725, [%rd594]; ld.local.u32 %r1726, [%rd594+-4]; and.b32 %r30, %r20, 31; setp.eq.s32 %p186, %r30, 0; @%p186 bra $L__BB0_135; mov.u32 
%r512, 32; sub.s32 %r513, %r512, %r30; shr.u32 %r514, %r1726, %r513; shl.b32 %r515, %r1725, %r30; add.s32 %r1725, %r514, %r515; mul.wide.s32 %rd597, %r27, 4; add.s64 %rd598, %rd592, %rd597; ld.local.u32 %r516, [%rd598]; shr.u32 %r517, %r516, %r513; shl.b32 %r518, %r1726, %r30; add.s32 %r1726, %r517, %r518; $L__BB0_135: and.b32 %r519, %r19, -2147483648; shr.u32 %r520, %r1726, 30; shl.b32 %r521, %r1725, 2; or.b32 %r522, %r520, %r521; shr.u32 %r523, %r522, 31; shr.u32 %r524, %r1725, 30; add.s32 %r525, %r523, %r524; neg.s32 %r526, %r525; setp.eq.s32 %p187, %r519, 0; selp.b32 %r1727, %r525, %r526, %p187; setp.ne.s32 %p188, %r523, 0; xor.b32 %r527, %r519, -2147483648; selp.b32 %r528, %r527, %r519, %p188; selp.b32 %r529, -1, 0, %p188; xor.b32 %r530, %r522, %r529; shl.b32 %r531, %r1726, 2; xor.b32 %r532, %r531, %r529; cvt.u64.u32 %rd599, %r530; cvt.u64.u32 %rd600, %r532; bfi.b64 %rd601, %rd599, %rd600, 32, 32; cvt.rn.f64.s64 %fd1, %rd601; mul.f64 %fd2, %fd1, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f1778, %fd2; setp.eq.s32 %p189, %r528, 0; neg.f32 %f1779, %f1778; selp.f32 %f5484, %f1778, %f1779, %p189; $L__BB0_137: mul.f32 %f1781, %f141, 0f3F22F983; cvt.rni.s32.f32 %r1732, %f1781; cvt.rn.f32.s32 %f1782, %r1732; fma.rn.f32 %f1784, %f1782, %f1773, %f141; fma.rn.f32 %f1786, %f1782, %f1775, %f1784; fma.rn.f32 %f5485, %f1782, %f1777, %f1786; abs.f32 %f148, %f141; setp.leu.f32 %p190, %f148, 0f47CE4780; @%p190 bra $L__BB0_145; setp.eq.f32 %p191, %f148, 0f7F800000; @%p191 bra $L__BB0_144; bra.uni $L__BB0_139; $L__BB0_144: mul.rn.f32 %f5485, %f141, %f1058; bra.uni $L__BB0_145; $L__BB0_166: mov.b32 %r80, %f174; bfe.u32 %r652, %r80, 23, 8; add.s32 %r81, %r652, -128; shl.b32 %r653, %r80, 8; or.b32 %r82, %r653, -2147483648; shr.u32 %r83, %r81, 5; add.u64 %rd647, %SP, 32; add.u64 %rd1332, %SPL, 32; mov.u32 %r1738, 0; mov.u64 %rd1331, __cudart_i2opi_f; mov.u32 %r1739, %r1738; $L__BB0_167: .pragma "nounroll"; mov.u32 %r85, %r1739; ld.global.nc.u32 %r656, [%rd1331]; // begin inline asm { 
mad.lo.cc.u32 %r654, %r656, %r82, %r85; madc.hi.u32 %r1739, %r656, %r82, 0; } // end inline asm st.local.u32 [%rd1332], %r654; add.s64 %rd1332, %rd1332, 4; add.s64 %rd1331, %rd1331, 4; add.s32 %r1738, %r1738, 1; setp.ne.s32 %p233, %r1738, 6; @%p233 bra $L__BB0_167; mov.u32 %r661, -1560706194; // begin inline asm { mad.lo.cc.u32 %r659, %r661, %r82, %r85; madc.hi.u32 %r660, %r661, %r82, 0; } // end inline asm st.local.u32 [%rd94], %r660; mov.u32 %r664, 4; sub.s32 %r88, %r664, %r83; mov.u32 %r665, 6; sub.s32 %r666, %r665, %r83; cvta.to.local.u64 %rd649, %rd647; mul.wide.s32 %rd650, %r666, 4; add.s64 %rd651, %rd649, %rd650; ld.local.u32 %r1740, [%rd651]; ld.local.u32 %r1741, [%rd651+-4]; and.b32 %r91, %r81, 31; setp.eq.s32 %p234, %r91, 0; @%p234 bra $L__BB0_170; mov.u32 %r667, 32; sub.s32 %r668, %r667, %r91; shr.u32 %r669, %r1741, %r668; shl.b32 %r670, %r1740, %r91; add.s32 %r1740, %r669, %r670; mul.wide.s32 %rd654, %r88, 4; add.s64 %rd655, %rd649, %rd654; ld.local.u32 %r671, [%rd655]; shr.u32 %r672, %r671, %r668; shl.b32 %r673, %r1741, %r91; add.s32 %r1741, %r672, %r673; $L__BB0_170: and.b32 %r674, %r80, -2147483648; shr.u32 %r675, %r1741, 30; shl.b32 %r676, %r1740, 2; or.b32 %r677, %r675, %r676; shr.u32 %r678, %r677, 31; shr.u32 %r679, %r1740, 30; add.s32 %r680, %r678, %r679; neg.s32 %r681, %r680; setp.eq.s32 %p235, %r674, 0; selp.b32 %r1742, %r680, %r681, %p235; setp.ne.s32 %p236, %r678, 0; xor.b32 %r682, %r674, -2147483648; selp.b32 %r683, %r682, %r674, %p236; selp.b32 %r684, -1, 0, %p236; xor.b32 %r685, %r677, %r684; shl.b32 %r686, %r1741, 2; xor.b32 %r687, %r686, %r684; cvt.u64.u32 %rd656, %r685; cvt.u64.u32 %rd657, %r687; bfi.b64 %rd658, %rd656, %rd657, 32, 32; cvt.rn.f64.s64 %fd7, %rd658; mul.f64 %fd8, %fd7, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f1975, %fd8; setp.eq.s32 %p237, %r683, 0; neg.f32 %f1976, %f1975; selp.f32 %f5489, %f1975, %f1976, %p237; $L__BB0_172: mov.f32 %f5194, 0f3F800000; setp.lt.f32 %p238, %f160, 0f00000000; selp.f32 %f1979, 0fBF800000, 
0f3F800000, %p238; mul.f32 %f185, %f160, %f1979; mul.f32 %f1981, %f5488, %f5488; mov.f32 %f1982, 0fBAB607ED; mov.f32 %f1983, 0f37CBAC00; fma.rn.f32 %f1984, %f1983, %f1981, %f1982; mov.f32 %f1985, 0f3D2AAABB; fma.rn.f32 %f1986, %f1984, %f1981, %f1985; mov.f32 %f1987, 0fBEFFFFFF; fma.rn.f32 %f1988, %f1986, %f1981, %f1987; fma.rn.f32 %f1989, %f1988, %f1981, %f5194; mov.f32 %f1990, 0f3C0885E4; mov.f32 %f1991, 0fB94D4153; fma.rn.f32 %f1992, %f1991, %f1981, %f1990; mov.f32 %f1993, 0fBE2AAAA8; fma.rn.f32 %f1994, %f1992, %f1981, %f1993; fma.rn.f32 %f1995, %f1981, %f5488, %f1058; fma.rn.f32 %f1996, %f1994, %f1995, %f5488; and.b32 %r688, %r1737, 1; setp.eq.b32 %p239, %r688, 1; selp.f32 %f1997, %f1989, %f1996, %p239; selp.f32 %f1998, %f1996, %f1989, %p239; neg.f32 %f1999, %f1997; and.b32 %r689, %r1737, 2; setp.eq.s32 %p240, %r689, 0; selp.f32 %f2000, %f1997, %f1999, %p240; neg.f32 %f2001, %f1998; add.s32 %r690, %r1737, 1; and.b32 %r691, %r690, 2; setp.eq.s32 %p241, %r691, 0; selp.f32 %f2002, %f1998, %f2001, %p241; mul.f32 %f2003, %f5489, %f5489; fma.rn.f32 %f2004, %f1983, %f2003, %f1982; fma.rn.f32 %f2005, %f2004, %f2003, %f1985; fma.rn.f32 %f2006, %f2005, %f2003, %f1987; fma.rn.f32 %f2007, %f2006, %f2003, %f5194; fma.rn.f32 %f2008, %f2003, %f5489, %f1058; fma.rn.f32 %f2009, %f1991, %f2003, %f1990; fma.rn.f32 %f2010, %f2009, %f2003, %f1993; fma.rn.f32 %f2011, %f2010, %f2008, %f5489; and.b32 %r692, %r1742, 1; setp.eq.b32 %p242, %r692, 1; selp.f32 %f2012, %f2007, %f2011, %p242; selp.f32 %f2013, %f2011, %f2007, %p242; and.b32 %r693, %r1742, 2; setp.eq.s32 %p243, %r693, 0; neg.f32 %f2014, %f2012; selp.f32 %f2015, %f2012, %f2014, %p243; add.s32 %r694, %r1742, 1; and.b32 %r695, %r694, 2; setp.eq.s32 %p244, %r695, 0; neg.f32 %f2016, %f2013; selp.f32 %f2017, %f2013, %f2016, %p244; mov.b32 %r696, %f2017; neg.f32 %f2018, %f2015; mov.b32 %r697, %f2015; cvt.u64.u32 %rd659, %r697; cvt.u64.u32 %rd660, %r696; bfi.b64 %rd117, %rd659, %rd660, 32, 32; mov.b32 %r698, %f2018; cvt.u64.u32 %rd661, 
%r698; bfi.b64 %rd118, %rd660, %rd661, 32, 32; mul.f32 %f2019, %f1979, %f2000; mov.b32 %r699, %f2019; cvt.u64.u32 %rd662, %r699; mov.b32 %r700, %f2002; cvt.u64.u32 %rd663, %r700; bfi.b64 %rd119, %rd662, %rd663, 32, 32; neg.f32 %f2020, %f2000; mov.b32 %r701, %f2020; mul.f32 %f2021, %f1979, %f2002; mov.b32 %r702, %f2021; cvt.u64.u32 %rd664, %r702; cvt.u64.u32 %rd665, %r701; bfi.b64 %rd120, %rd664, %rd665, 32, 32; mul.f32 %f2022, %f159, 0f4B000000; setp.lt.f32 %p245, %f159, 0f00800000; selp.f32 %f186, %f2022, %f159, %p245; selp.f32 %f2023, 0fC1B80000, 0f00000000, %p245; mov.b32 %r703, %f186; add.s32 %r704, %r703, -1059760811; and.b32 %r705, %r704, -8388608; sub.s32 %r706, %r703, %r705; mov.b32 %f2024, %r706; cvt.rn.f32.s32 %f2025, %r705; mov.f32 %f2026, 0f34000000; fma.rn.f32 %f2027, %f2025, %f2026, %f2023; add.f32 %f2028, %f2024, 0fBF800000; mov.f32 %f2029, 0f3E1039F6; mov.f32 %f2030, 0fBE055027; fma.rn.f32 %f2031, %f2030, %f2028, %f2029; mov.f32 %f2032, 0fBDF8CDCC; fma.rn.f32 %f2033, %f2031, %f2028, %f2032; mov.f32 %f2034, 0f3E0F2955; fma.rn.f32 %f2035, %f2033, %f2028, %f2034; mov.f32 %f2036, 0fBE2AD8B9; fma.rn.f32 %f2037, %f2035, %f2028, %f2036; mov.f32 %f2038, 0f3E4CED0B; fma.rn.f32 %f2039, %f2037, %f2028, %f2038; mov.f32 %f2040, 0fBE7FFF22; fma.rn.f32 %f2041, %f2039, %f2028, %f2040; mov.f32 %f2042, 0f3EAAAA78; fma.rn.f32 %f2043, %f2041, %f2028, %f2042; mov.f32 %f2044, 0fBF000000; fma.rn.f32 %f2045, %f2043, %f2028, %f2044; mul.f32 %f2046, %f2028, %f2045; fma.rn.f32 %f2047, %f2046, %f2028, %f2028; mov.f32 %f2048, 0f3F317218; fma.rn.f32 %f5490, %f2027, %f2048, %f2047; setp.lt.u32 %p246, %r703, 2139095040; @%p246 bra $L__BB0_174; mov.f32 %f2049, 0f7F800000; fma.rn.f32 %f5490, %f186, %f2049, %f2049; $L__BB0_174: setp.eq.f32 %p247, %f186, 0f00000000; selp.f32 %f190, 0fFF800000, %f5490, %p247; mul.f32 %f2050, %f185, 0f4B000000; setp.lt.f32 %p248, %f185, 0f00800000; selp.f32 %f191, %f2050, %f185, %p248; selp.f32 %f2051, 0fC1B80000, 0f00000000, %p248; mov.b32 %r707, 
%f191; add.s32 %r708, %r707, -1059760811; and.b32 %r709, %r708, -8388608; sub.s32 %r710, %r707, %r709; mov.b32 %f2052, %r710; cvt.rn.f32.s32 %f2053, %r709; fma.rn.f32 %f2055, %f2053, %f2026, %f2051; add.f32 %f2056, %f2052, 0fBF800000; fma.rn.f32 %f2059, %f2030, %f2056, %f2029; fma.rn.f32 %f2061, %f2059, %f2056, %f2032; fma.rn.f32 %f2063, %f2061, %f2056, %f2034; fma.rn.f32 %f2065, %f2063, %f2056, %f2036; fma.rn.f32 %f2067, %f2065, %f2056, %f2038; fma.rn.f32 %f2069, %f2067, %f2056, %f2040; fma.rn.f32 %f2071, %f2069, %f2056, %f2042; fma.rn.f32 %f2073, %f2071, %f2056, %f2044; mul.f32 %f2074, %f2056, %f2073; fma.rn.f32 %f2075, %f2074, %f2056, %f2056; fma.rn.f32 %f5491, %f2055, %f2048, %f2075; setp.lt.u32 %p249, %r707, 2139095040; @%p249 bra $L__BB0_176; mov.f32 %f2077, 0f7F800000; fma.rn.f32 %f5491, %f191, %f2077, %f2077; $L__BB0_176: mov.u64 %rd1333, 0; setp.eq.f32 %p250, %f191, 0f00000000; selp.f32 %f195, 0fFF800000, %f5491, %p250; mov.b32 %r711, %f195; cvt.u64.u32 %rd668, %r711; mov.b32 %r712, %f190; cvt.u64.u32 %rd669, %r712; bfi.b64 %rd670, %rd668, %rd669, 32, 32; add.u64 %rd672, %SPL, 16; st.local.u64 [%rd672], %rd670; add.u64 %rd673, %SP, 32; add.u64 %rd674, %SPL, 32; st.local.u64 [%rd674], %rd1333; add.u64 %rd1335, %SP, 0; cvta.to.local.u64 %rd676, %rd1335; mov.u64 %rd1337, 1; st.local.u64 [%rd676], %rd1337; setp.le.f32 %p251, %f195, %f190; setp.ge.f32 %p252, %f195, %f190; selp.b16 %rs13, 1, 2, %p252; setp.ltu.f32 %p253, %f195, %f190; selp.b16 %rs14, -1, 0, %p253; selp.b16 %rs15, %rs14, %rs13, %p251; setp.ne.s16 %p254, %rs15, -1; mov.f32 %f5492, %f195; mov.u64 %rd1334, %rd1337; @%p254 bra $L__BB0_178; add.u64 %rd680, %SPL, 0; mov.u64 %rd1334, 0; st.local.u64 [%rd680], %rd1334; add.u64 %rd682, %SPL, 32; mov.u64 %rd1333, 1; st.local.u64 [%rd682], %rd1333; mov.f32 %f5492, %f190; $L__BB0_178: setp.ge.f32 %p255, %f195, %f5492; selp.b16 %rs16, 1, 2, %p255; setp.ltu.f32 %p256, %f195, %f5492; selp.b16 %rs17, -1, 0, %p256; setp.le.f32 %p257, %f195, %f5492; selp.b16 
%rs18, %rs17, %rs16, %p257; setp.ne.s16 %p258, %rs18, -1; mov.u64 %rd1336, %rd1334; @%p258 bra $L__BB0_182; shl.b64 %rd687, %rd1333, 2; add.s64 %rd688, %rd672, %rd687; ld.local.f32 %f2078, [%rd688]; setp.le.f32 %p259, %f195, %f2078; setp.ge.f32 %p260, %f195, %f2078; selp.b16 %rs19, 1, 2, %p260; setp.ltu.f32 %p261, %f195, %f2078; selp.b16 %rs20, -1, 0, %p261; selp.b16 %rs21, %rs20, %rs19, %p259; setp.ne.s16 %p262, %rs21, -1; @%p262 bra $L__BB0_181; add.u64 %rd691, %SPL, 0; st.local.u64 [%rd691], %rd1333; mov.u64 %rd1335, %rd673; $L__BB0_181: cvta.to.local.u64 %rd692, %rd1335; mov.u64 %rd693, 1; st.local.u64 [%rd692], %rd693; ld.local.u64 %rd1336, [%rd676]; mov.u64 %rd1337, %rd1334; $L__BB0_182: ld.f32 %f197, [%rd106]; add.f32 %f2079, %f197, 0fBF800000; ld.global.f32 %f198, [%rd67+48]; sub.f32 %f199, %f198, %f2079; add.f32 %f2080, %f190, 0f00000000; add.f32 %f200, %f2080, %f195; shl.b64 %rd698, %rd1337, 2; add.s64 %rd127, %rd672, %rd698; ld.local.f32 %f201, [%rd127]; add.f32 %f202, %f156, %f156; mul.f32 %f2081, %f202, %f201; fma.rn.f32 %f2082, %f155, %f200, %f2081; setp.gtu.f32 %p263, %f2082, %f199; @%p263 bra $L__BB0_184; bra.uni $L__BB0_341; $L__BB0_184: add.f32 %f203, %f155, %f202; setp.gt.u64 %p264, %rd1336, 1; @%p264 bra $L__BB0_189; shl.b64 %rd701, %rd1336, 2; add.s64 %rd702, %rd672, %rd701; ld.local.f32 %f2083, [%rd702]; sub.f32 %f2084, %f200, %f201; mul.f32 %f204, %f155, %f2084; fma.rn.f32 %f2085, %f203, %f2083, %f204; setp.gtu.f32 %p265, %f2085, %f199; @%p265 bra $L__BB0_187; bra.uni $L__BB0_186; $L__BB0_187: fma.rn.f32 %f2088, %f155, 0f40400000, %f202; div.rn.f32 %f5493, %f199, %f2088; mov.b32 %r713, %f5493; st.local.v2.f32 [%rd672], {%f5493, %f5493}; mov.b64 %rd1338, {%r713, %r713}; bra.uni $L__BB0_188; $L__BB0_186: sub.f32 %f2086, %f199, %f204; div.rn.f32 %f2087, %f2086, %f203; st.local.f32 [%rd127], %f2087; ld.local.f32 %f5493, [%rd672+4]; ld.local.u64 %rd1338, [%rd672]; $L__BB0_188: mov.f32 %f5396, 0f3FB8AA3B; cvt.u32.u64 %r714, %rd1338; mov.b32 %f2089, 
%r714; shr.u64 %rd707, %rd1338, 32; cvt.u32.u64 %r715, %rd707; mov.b32 %f2090, %r715; sub.f32 %f2091, %f190, %f2089; sub.f32 %f2092, %f195, %f2090; mul.f32 %f2093, %f2092, %f2092; fma.rn.f32 %f2094, %f2091, %f2091, %f2093; add.f32 %f2095, %f2094, 0f00000000; sqrt.rn.f32 %f2096, %f2095; ld.global.f32 %f2097, [%rd67+52]; fma.rn.f32 %f2098, %f2097, %f2096, %f197; min.f32 %f2099, %f2098, %f198; st.f32 [%rd106], %f2099; mov.f32 %f2100, 0f3F000000; mov.f32 %f2101, 0f3BBB989D; fma.rn.f32 %f2102, %f2089, %f2101, %f2100; mov.f32 %f2104, 0f437C0000; cvt.sat.f32.f32 %f2105, %f2102; mov.f32 %f2106, 0f4B400001; fma.rm.f32 %f2107, %f2105, %f2104, %f2106; add.f32 %f2108, %f2107, 0fCB40007F; neg.f32 %f2109, %f2108; fma.rn.f32 %f2110, %f2089, %f5396, %f2109; mov.f32 %f2111, 0f32A57060; fma.rn.f32 %f2112, %f2089, %f2111, %f2110; mov.b32 %r716, %f2107; shl.b32 %r717, %r716, 23; mov.b32 %f2113, %r717; ex2.approx.ftz.f32 %f2114, %f2112; mul.f32 %f2115, %f2114, %f2113; fma.rn.f32 %f2116, %f5493, %f2101, %f2100; cvt.sat.f32.f32 %f2117, %f2116; fma.rm.f32 %f2118, %f2117, %f2104, %f2106; add.f32 %f2119, %f2118, 0fCB40007F; neg.f32 %f2120, %f2119; fma.rn.f32 %f2121, %f5493, %f5396, %f2120; fma.rn.f32 %f2122, %f5493, %f2111, %f2121; mov.b32 %r718, %f2118; shl.b32 %r719, %r718, 23; mov.b32 %f2123, %r719; ex2.approx.ftz.f32 %f2124, %f2122; mul.f32 %f2125, %f2124, %f2123; mov.b64 {%r720, %r721}, %rd118; mov.b64 {%r722, %r723}, %rd117; mov.b32 %f2126, %r722; mul.f32 %f2127, %f2126, %f2115; mov.b32 %f2128, %r723; mul.f32 %f2129, %f2128, %f2115; mov.b32 %f2130, %r720; mul.f32 %f2131, %f2130, %f2125; mov.b32 %f2132, %r721; mul.f32 %f2133, %f2132, %f2125; mov.b64 {%r724, %r725}, %rd120; mov.b64 {%r726, %r727}, %rd119; mov.b32 %f2134, %r726; mov.b32 %f2135, %r727; mul.f32 %f2136, %f2135, %f2131; mul.f32 %f2137, %f2135, %f2133; fma.rn.f32 %f5531, %f2134, %f2129, %f2137; mov.b32 %f2138, %r724; mov.b32 %f2139, %r725; mul.f32 %f2140, %f2139, %f2131; fma.rn.f32 %f5530, %f2138, %f2127, %f2140; mul.f32 
%f2141, %f2139, %f2133; fma.rn.f32 %f2142, %f2138, %f2129, %f2141; fma.rn.f32 %f2143, %f2134, %f2127, %f2136; st.local.v4.f32 [%rd522], {%f2143, %f5531, %f5530, %f2142}; bra.uni $L__BB0_341; $L__BB0_139: mov.b32 %r38, %f141; bfe.u32 %r535, %r38, 23, 8; add.s32 %r39, %r535, -128; shl.b32 %r536, %r38, 8; or.b32 %r40, %r536, -2147483648; shr.u32 %r41, %r39, 5; mov.u32 %r1728, 0; mov.u64 %rd1327, __cudart_i2opi_f; mov.u32 %r1729, %r1728; $L__BB0_140: .pragma "nounroll"; mov.u32 %r43, %r1729; ld.global.nc.u32 %r539, [%rd1327]; // begin inline asm { mad.lo.cc.u32 %r537, %r539, %r40, %r43; madc.hi.u32 %r1729, %r539, %r40, 0; } // end inline asm st.local.u32 [%rd1328], %r537; add.s64 %rd1328, %rd1328, 4; add.s64 %rd1327, %rd1327, 4; add.s32 %r1728, %r1728, 1; setp.ne.s32 %p192, %r1728, 6; @%p192 bra $L__BB0_140; mov.u32 %r544, -1560706194; // begin inline asm { mad.lo.cc.u32 %r542, %r544, %r40, %r43; madc.hi.u32 %r543, %r544, %r40, 0; } // end inline asm st.local.u32 [%rd94], %r543; mov.u32 %r547, 4; sub.s32 %r46, %r547, %r41; mov.u32 %r548, 6; sub.s32 %r549, %r548, %r41; cvta.to.local.u64 %rd605, %rd1360; mul.wide.s32 %rd606, %r549, 4; add.s64 %rd607, %rd605, %rd606; ld.local.u32 %r1730, [%rd607]; ld.local.u32 %r1731, [%rd607+-4]; and.b32 %r49, %r39, 31; setp.eq.s32 %p193, %r49, 0; @%p193 bra $L__BB0_143; mov.u32 %r550, 32; sub.s32 %r551, %r550, %r49; shr.u32 %r552, %r1731, %r551; shl.b32 %r553, %r1730, %r49; add.s32 %r1730, %r552, %r553; mul.wide.s32 %rd610, %r46, 4; add.s64 %rd611, %rd605, %rd610; ld.local.u32 %r554, [%rd611]; shr.u32 %r555, %r554, %r551; shl.b32 %r556, %r1731, %r49; add.s32 %r1731, %r555, %r556; $L__BB0_143: and.b32 %r557, %r38, -2147483648; shr.u32 %r558, %r1731, 30; shl.b32 %r559, %r1730, 2; or.b32 %r560, %r558, %r559; shr.u32 %r561, %r560, 31; shr.u32 %r562, %r1730, 30; add.s32 %r563, %r561, %r562; neg.s32 %r564, %r563; setp.eq.s32 %p194, %r557, 0; selp.b32 %r1732, %r563, %r564, %p194; setp.ne.s32 %p195, %r561, 0; xor.b32 %r565, %r557, -2147483648; 
selp.b32 %r566, %r565, %r557, %p195; selp.b32 %r567, -1, 0, %p195; xor.b32 %r568, %r560, %r567; shl.b32 %r569, %r1731, 2; xor.b32 %r570, %r569, %r567; cvt.u64.u32 %rd612, %r568; cvt.u64.u32 %rd613, %r570; bfi.b64 %rd614, %rd612, %rd613, 32, 32; cvt.rn.f64.s64 %fd3, %rd614; mul.f64 %fd4, %fd3, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f1788, %fd4; setp.eq.s32 %p196, %r566, 0; neg.f32 %f1789, %f1788; selp.f32 %f5485, %f1788, %f1789, %p196; $L__BB0_145: mov.f32 %f5395, 0f3FB8AA3B; mov.f32 %f5193, 0f3F800000; setp.lt.f32 %p197, %f127, 0f00000000; selp.f32 %f1792, 0fBF800000, 0f3F800000, %p197; mul.f32 %f1794, %f127, %f1792; mul.f32 %f1795, %f5484, %f5484; mov.f32 %f1796, 0fBAB607ED; mov.f32 %f1797, 0f37CBAC00; fma.rn.f32 %f1798, %f1797, %f1795, %f1796; mov.f32 %f1799, 0f3D2AAABB; fma.rn.f32 %f1800, %f1798, %f1795, %f1799; mov.f32 %f1801, 0fBEFFFFFF; fma.rn.f32 %f1802, %f1800, %f1795, %f1801; fma.rn.f32 %f1803, %f1802, %f1795, %f5193; mov.f32 %f1804, 0f3C0885E4; mov.f32 %f1805, 0fB94D4153; fma.rn.f32 %f1806, %f1805, %f1795, %f1804; mov.f32 %f1807, 0fBE2AAAA8; fma.rn.f32 %f1808, %f1806, %f1795, %f1807; fma.rn.f32 %f1809, %f1795, %f5484, %f1058; fma.rn.f32 %f1810, %f1808, %f1809, %f5484; and.b32 %r571, %r1727, 1; setp.eq.b32 %p198, %r571, 1; selp.f32 %f1811, %f1803, %f1810, %p198; selp.f32 %f1812, %f1810, %f1803, %p198; neg.f32 %f1813, %f1811; and.b32 %r572, %r1727, 2; setp.eq.s32 %p199, %r572, 0; selp.f32 %f1814, %f1811, %f1813, %p199; neg.f32 %f1815, %f1812; add.s32 %r573, %r1727, 1; and.b32 %r574, %r573, 2; setp.eq.s32 %p200, %r574, 0; selp.f32 %f1816, %f1812, %f1815, %p200; mul.f32 %f1817, %f5485, %f5485; fma.rn.f32 %f1818, %f1797, %f1817, %f1796; fma.rn.f32 %f1819, %f1818, %f1817, %f1799; fma.rn.f32 %f1820, %f1819, %f1817, %f1801; fma.rn.f32 %f1821, %f1820, %f1817, %f5193; fma.rn.f32 %f1822, %f1817, %f5485, %f1058; fma.rn.f32 %f1823, %f1805, %f1817, %f1804; fma.rn.f32 %f1824, %f1823, %f1817, %f1807; fma.rn.f32 %f1825, %f1824, %f1822, %f5485; and.b32 %r575, %r1732, 1; 
setp.eq.b32 %p201, %r575, 1; selp.f32 %f1826, %f1821, %f1825, %p201; selp.f32 %f1827, %f1825, %f1821, %p201; and.b32 %r576, %r1732, 2; setp.eq.s32 %p202, %r576, 0; neg.f32 %f1828, %f1826; selp.f32 %f1829, %f1826, %f1828, %p202; add.s32 %r577, %r1732, 1; and.b32 %r578, %r577, 2; setp.eq.s32 %p203, %r578, 0; neg.f32 %f1830, %f1827; selp.f32 %f1831, %f1827, %f1830, %p203; mov.b32 %r579, %f1831; neg.f32 %f1832, %f1829; mov.b32 %r580, %f1829; cvt.u64.u32 %rd615, %r580; mov.b32 %r581, %f1832; cvt.u64.u32 %rd616, %r581; cvt.u64.u32 %rd617, %r579; bfi.b64 %rd618, %rd617, %rd616, 32, 32; mov.b64 {%r582, %r583}, %rd618; bfi.b64 %rd619, %rd615, %rd617, 32, 32; mov.b64 {%r584, %r585}, %rd619; mul.f32 %f1833, %f1792, %f1814; mov.b32 %r586, %f1833; cvt.u64.u32 %rd620, %r586; mov.b32 %r587, %f1816; cvt.u64.u32 %rd621, %r587; neg.f32 %f1834, %f1814; mov.b32 %r588, %f1834; mul.f32 %f1835, %f1792, %f1816; mov.b32 %r589, %f1835; cvt.u64.u32 %rd622, %r589; cvt.u64.u32 %rd623, %r588; bfi.b64 %rd624, %rd622, %rd623, 32, 32; mov.b64 {%r590, %r591}, %rd624; bfi.b64 %rd625, %rd620, %rd621, 32, 32; mov.b64 {%r592, %r593}, %rd625; ld.global.f32 %f1836, [%rd67+40]; sub.f32 %f1837, %f5193, %f1836; max.f32 %f1838, %f126, %f1837; ld.global.f32 %f1839, [%rd67+44]; add.f32 %f1840, %f1839, 0f3F800000; min.f32 %f1841, %f1838, %f1840; max.f32 %f1842, %f1794, %f1837; min.f32 %f1843, %f1842, %f1840; mul.f32 %f1844, %f1841, %f1843; mul.f32 %f1845, %f126, %f1794; div.rn.f32 %f1846, %f1845, %f1844; mul.f32 %f5532, %f5532, %f1846; sub.f32 %f1847, %f5193, %f5532; ld.global.f32 %f1848, [%rd67+48]; mul.f32 %f1849, %f1848, %f1847; mov.f32 %f1850, 0f3F000000; mov.f32 %f1851, 0f3BBB989D; fma.rn.f32 %f1852, %f1849, %f1851, %f1850; mov.f32 %f1854, 0f437C0000; cvt.sat.f32.f32 %f1855, %f1852; mov.f32 %f1856, 0f4B400001; fma.rm.f32 %f1857, %f1855, %f1854, %f1856; add.f32 %f1858, %f1857, 0fCB40007F; neg.f32 %f1859, %f1858; fma.rn.f32 %f1860, %f1849, %f5395, %f1859; mov.f32 %f1861, 0f32A57060; fma.rn.f32 %f1862, 
%f1849, %f1861, %f1860; mov.b32 %r594, %f1857; shl.b32 %r595, %r594, 23; mov.b32 %f1863, %r595; ex2.approx.ftz.f32 %f1864, %f1862; mul.f32 %f1865, %f1864, %f1863; st.f32 [%rd95], %f1865; mov.b32 %f1866, %r584; mul.f32 %f1867, %f1841, %f1866; mov.b32 %f1868, %r585; mul.f32 %f1869, %f1841, %f1868; mov.b32 %f1870, %r582; mul.f32 %f1871, %f1843, %f1870; mov.b32 %f1872, %r583; mul.f32 %f1873, %f1843, %f1872; mov.b32 %f1874, %r592; mov.b32 %f1875, %r593; mul.f32 %f1876, %f1875, %f1871; mul.f32 %f1877, %f1875, %f1873; fma.rn.f32 %f5531, %f1874, %f1869, %f1877; mov.b32 %f1878, %r590; mov.b32 %f1879, %r591; mul.f32 %f1880, %f1879, %f1871; fma.rn.f32 %f5530, %f1878, %f1867, %f1880; mul.f32 %f1881, %f1879, %f1873; fma.rn.f32 %f1882, %f1878, %f1869, %f1881; fma.rn.f32 %f1883, %f1874, %f1867, %f1876; st.local.v4.f32 [%rd522], {%f1883, %f5531, %f5530, %f1882}; bra.uni $L__BB0_341; $L__BB0_394: mov.b32 %r1212, %f5548; xor.b32 %r1213, %r1212, -2147483648; mov.b32 %f3528, %r1213; selp.f32 %f5550, %f3528, %f5548, %p12; setp.geu.f32 %p533, %f498, 0f00000000; @%p533 bra $L__BB0_398; cvt.rn.f32.s32 %f5315, %r218; cvt.rzi.f32.f32 %f3529, %f5315; setp.eq.f32 %p534, %f3529, %f5315; @%p534 bra $L__BB0_398; mov.f32 %f5550, 0f7FFFFFFF; $L__BB0_398: cvt.rn.f32.s32 %f5239, %r218; abs.f32 %f5238, %f5239; abs.f32 %f5237, %f498; add.f32 %f3532, %f5237, %f5238; mov.b32 %r1218, %f3532; setp.lt.s32 %p537, %r1218, 2139095040; @%p537 bra $L__BB0_405; cvt.rn.f32.s32 %f5309, %r218; abs.f32 %f5308, %f5309; abs.f32 %f5307, %f498; setp.gtu.f32 %p538, %f5307, 0f7F800000; setp.gtu.f32 %p539, %f5308, 0f7F800000; or.pred %p540, %p538, %p539; @%p540 bra $L__BB0_404; bra.uni $L__BB0_400; $L__BB0_404: cvt.rn.f32.s32 %f5314, %r218; add.f32 %f5550, %f498, %f5314; bra.uni $L__BB0_405; $L__BB0_411: mov.b32 %r1241, %f5555; xor.b32 %r1242, %r1241, -2147483648; mov.b32 %f3662, %r1242; selp.f32 %f5557, %f3662, %f5555, %p13; setp.geu.f32 %p559, %f532, 0f00000000; @%p559 bra $L__BB0_415; mov.f32 %f5333, 0fBF800000; 
cvt.rzi.f32.f32 %f3664, %f5333; setp.eq.f32 %p560, %f3664, 0fBF800000; @%p560 bra $L__BB0_415; mov.f32 %f5557, 0f7FFFFFFF; $L__BB0_415: add.f32 %f3668, %f539, 0f3F800000; mov.b32 %r1245, %f3668; setp.lt.s32 %p562, %r1245, 2139095040; @%p562 bra $L__BB0_420; setp.gtu.f32 %p563, %f539, 0f7F800000; @%p563 bra $L__BB0_419; bra.uni $L__BB0_417; $L__BB0_419: add.f32 %f5557, %f532, 0fBF800000; bra.uni $L__BB0_420; $L__BB0_275: setp.neu.f32 %p384, %f328, 0f7F800000; @%p384 bra $L__BB0_278; selp.f32 %f5514, 0fFF800000, 0f7F800000, %p10; $L__BB0_278: ld.global.u8 %rs24, [%rd67+48]; setp.eq.s16 %p385, %rs24, 0; @%p385 bra $L__BB0_282; mov.f32 %f5224, 0fBF000000; div.rn.f32 %f2778, %f255, %f326; setp.lt.f32 %p386, %f2778, 0f00800000; mul.f32 %f2779, %f2778, 0f4B000000; selp.f32 %f339, %f2779, %f2778, %p386; selp.f32 %f2780, 0fC1B80000, 0f00000000, %p386; mov.b32 %r932, %f339; add.s32 %r933, %r932, -1059760811; and.b32 %r934, %r933, -8388608; sub.s32 %r935, %r932, %r934; mov.b32 %f2781, %r935; cvt.rn.f32.s32 %f2782, %r934; mov.f32 %f2783, 0f34000000; fma.rn.f32 %f2784, %f2782, %f2783, %f2780; add.f32 %f2785, %f2781, 0fBF800000; mov.f32 %f2786, 0f3E1039F6; mov.f32 %f2787, 0fBE055027; fma.rn.f32 %f2788, %f2787, %f2785, %f2786; mov.f32 %f2789, 0fBDF8CDCC; fma.rn.f32 %f2790, %f2788, %f2785, %f2789; mov.f32 %f2791, 0f3E0F2955; fma.rn.f32 %f2792, %f2790, %f2785, %f2791; mov.f32 %f2793, 0fBE2AD8B9; fma.rn.f32 %f2794, %f2792, %f2785, %f2793; mov.f32 %f2795, 0f3E4CED0B; fma.rn.f32 %f2796, %f2794, %f2785, %f2795; mov.f32 %f2797, 0fBE7FFF22; fma.rn.f32 %f2798, %f2796, %f2785, %f2797; mov.f32 %f2799, 0f3EAAAA78; fma.rn.f32 %f2800, %f2798, %f2785, %f2799; fma.rn.f32 %f2802, %f2800, %f2785, %f5224; mul.f32 %f2803, %f2785, %f2802; fma.rn.f32 %f2804, %f2803, %f2785, %f2785; mov.f32 %f2805, 0f3F317218; fma.rn.f32 %f5515, %f2784, %f2805, %f2804; setp.lt.u32 %p387, %r932, 2139095040; @%p387 bra $L__BB0_281; mov.f32 %f2806, 0f7F800000; fma.rn.f32 %f5515, %f339, %f2806, %f2806; $L__BB0_281: 
setp.eq.f32 %p388, %f339, 0f00000000; selp.f32 %f2807, 0fFF800000, %f5515, %p388; add.f32 %f5517, %f5517, %f2807; $L__BB0_282: setp.eq.f32 %p389, %f326, 0f3F800000; selp.f32 %f2808, 0f3F800000, %f5514, %p389; mov.b64 {%r936, %r937}, %rd145; mov.b64 {%r938, %r939}, %rd144; mov.b32 %f2809, %r938; mul.f32 %f2810, %f2809, %f2808; mov.b32 %f2811, %r939; mul.f32 %f2812, %f2811, %f2808; mov.b32 %f2813, %r936; mul.f32 %f2814, %f2813, %f2808; mov.b32 %f2815, %r937; mul.f32 %f2816, %f2815, %f2808; mov.b64 {%r940, %r941}, %rd147; mov.b64 {%r942, %r943}, %rd146; mov.b32 %f2817, %r942; mov.b32 %f2818, %r943; mul.f32 %f2819, %f2818, %f2814; mul.f32 %f2820, %f2818, %f2816; mov.b32 %f2821, %r940; mov.b32 %f2822, %r941; mul.f32 %f2823, %f2822, %f2814; mul.f32 %f2824, %f2822, %f2816; fma.rn.f32 %f2825, %f2817, %f2812, %f2820; mov.b32 %r944, %f2825; fma.rn.f32 %f2826, %f2817, %f2810, %f2819; mov.b32 %r945, %f2826; fma.rn.f32 %f2827, %f2821, %f2812, %f2824; mov.b32 %r946, %f2827; fma.rn.f32 %f2828, %f2821, %f2810, %f2823; mov.b32 %r947, %f2828; mov.b64 %rd1344, {%r947, %r946}; mov.b64 %rd1343, {%r945, %r944}; bra.uni $L__BB0_283; $L__BB0_287: setp.lt.s32 %p400, %r141, 0; min.f32 %f2851, %f354, %f353; max.f32 %f2852, %f354, %f353; div.rn.f32 %f2853, %f2851, %f2852; mul.rn.f32 %f2854, %f2853, %f2853; mov.f32 %f2855, 0fC0B59883; mov.f32 %f2856, 0fBF52C7EA; fma.rn.f32 %f2857, %f2854, %f2856, %f2855; mov.f32 %f2858, 0fC0D21907; fma.rn.f32 %f2859, %f2857, %f2854, %f2858; mul.f32 %f2860, %f2854, %f2859; mul.f32 %f2861, %f2853, %f2860; add.f32 %f2862, %f2854, 0f41355DC0; mov.f32 %f2863, 0f41E6BD60; fma.rn.f32 %f2864, %f2862, %f2854, %f2863; mov.f32 %f2865, 0f419D92C8; fma.rn.f32 %f2866, %f2864, %f2854, %f2865; rcp.rn.f32 %f2867, %f2866; fma.rn.f32 %f2868, %f2861, %f2867, %f2853; mov.f32 %f2869, 0f3FC90FDB; sub.f32 %f2870, %f2869, %f2868; setp.gt.f32 %p401, %f354, %f353; selp.f32 %f2871, %f2870, %f2868, %p401; mov.f32 %f2872, 0f40490FDB; sub.f32 %f2873, %f2872, %f2871; selp.f32 %f2874, %f2873, 
%f2871, %p400; mov.b32 %r953, %f2874; or.b32 %r954, %r142, %r953; mov.b32 %f2875, %r954; add.f32 %f2876, %f353, %f354; setp.le.f32 %p402, %f2876, 0f7F800000; selp.f32 %f5518, %f2875, %f2876, %p402; $L__BB0_290: abs.f32 %f359, %f348; setp.eq.f32 %p404, %f359, 0f00000000; abs.f32 %f360, %f349; setp.eq.f32 %p405, %f360, 0f00000000; and.pred %p406, %p404, %p405; mov.b32 %r143, %f348; mov.b32 %r960, %f349; and.b32 %r144, %r960, -2147483648; @%p406 bra $L__BB0_294; bra.uni $L__BB0_291; $L__BB0_294: shr.s32 %r965, %r143, 31; and.b32 %r966, %r965, 1078530011; or.b32 %r967, %r966, %r144; mov.b32 %f5519, %r967; bra.uni $L__BB0_295; $L__BB0_291: setp.eq.f32 %p407, %f359, 0f7F800000; setp.eq.f32 %p408, %f360, 0f7F800000; and.pred %p409, %p407, %p408; @%p409 bra $L__BB0_293; bra.uni $L__BB0_292; $L__BB0_293: setp.lt.s32 %p413, %r143, 0; selp.b32 %r963, 1075235812, 1061752795, %p413; or.b32 %r964, %r963, %r144; mov.b32 %f5519, %r964; bra.uni $L__BB0_295; $L__BB0_292: setp.lt.s32 %p410, %r143, 0; min.f32 %f2877, %f360, %f359; max.f32 %f2878, %f360, %f359; div.rn.f32 %f2879, %f2877, %f2878; mul.rn.f32 %f2880, %f2879, %f2879; mov.f32 %f2881, 0fC0B59883; mov.f32 %f2882, 0fBF52C7EA; fma.rn.f32 %f2883, %f2880, %f2882, %f2881; mov.f32 %f2884, 0fC0D21907; fma.rn.f32 %f2885, %f2883, %f2880, %f2884; mul.f32 %f2886, %f2880, %f2885; mul.f32 %f2887, %f2879, %f2886; add.f32 %f2888, %f2880, 0f41355DC0; mov.f32 %f2889, 0f41E6BD60; fma.rn.f32 %f2890, %f2888, %f2880, %f2889; mov.f32 %f2891, 0f419D92C8; fma.rn.f32 %f2892, %f2890, %f2880, %f2891; rcp.rn.f32 %f2893, %f2892; fma.rn.f32 %f2894, %f2887, %f2893, %f2879; mov.f32 %f2895, 0f3FC90FDB; sub.f32 %f2896, %f2895, %f2894; setp.gt.f32 %p411, %f360, %f359; selp.f32 %f2897, %f2896, %f2894, %p411; mov.f32 %f2898, 0f40490FDB; sub.f32 %f2899, %f2898, %f2897; selp.f32 %f2900, %f2899, %f2897, %p410; mov.b32 %r961, %f2900; or.b32 %r962, %r144, %r961; mov.b32 %f2901, %r962; add.f32 %f2902, %f359, %f360; setp.le.f32 %p412, %f2902, 0f7F800000; selp.f32 
%f5519, %f2901, %f2902, %p412; $L__BB0_295: sub.f32 %f2903, %f5519, %f5518; mul.f32 %f365, %f2903, 0f3F000000; add.f32 %f2904, %f5518, %f5519; mul.f32 %f366, %f2904, 0f3F000000; mul.f32 %f2905, %f365, 0f3F22F983; cvt.rni.s32.f32 %r1757, %f2905; cvt.rn.f32.s32 %f2906, %r1757; mov.f32 %f2907, 0fBFC90FDA; fma.rn.f32 %f2908, %f2906, %f2907, %f365; mov.f32 %f2909, 0fB3A22168; fma.rn.f32 %f2910, %f2906, %f2909, %f2908; mov.f32 %f2911, 0fA7C234C5; fma.rn.f32 %f5520, %f2906, %f2911, %f2910; abs.f32 %f368, %f365; setp.leu.f32 %p414, %f368, 0f47CE4780; @%p414 bra $L__BB0_303; setp.eq.f32 %p415, %f368, 0f7F800000; @%p415 bra $L__BB0_302; bra.uni $L__BB0_297; $L__BB0_302: mul.rn.f32 %f5520, %f365, %f1058; bra.uni $L__BB0_303; $L__BB0_400: cvt.rn.f32.s32 %f5311, %r218; abs.f32 %f5310, %f5311; setp.eq.f32 %p541, %f5310, 0f7F800000; @%p541 bra $L__BB0_403; bra.uni $L__BB0_401; $L__BB0_403: abs.f32 %f5313, %f498; setp.gt.f32 %p544, %f5313, 0f3F800000; selp.b32 %r1222, 2139095040, 0, %p544; xor.b32 %r1223, %r1222, 2139095040; setp.lt.s32 %p545, %r218, 0; selp.b32 %r1224, %r1223, %r1222, %p545; mov.b32 %f3533, %r1224; setp.eq.f32 %p546, %f498, 0fBF800000; selp.f32 %f5550, 0f3F800000, %f3533, %p546; bra.uni $L__BB0_405; $L__BB0_417: setp.neu.f32 %p564, %f539, 0f7F800000; @%p564 bra $L__BB0_420; selp.f32 %f5557, 0f80000000, 0f00000000, %p13; $L__BB0_420: mov.f32 %f5328, 0f00000000; mul.f32 %f5327, %f5530, %f5635; fma.rn.f32 %f5326, %f5546, %f5531, %f5327; mul.f32 %f5325, %f5635, %f5635; fma.rn.f32 %f5324, %f5531, %f5531, %f5325; mul.f32 %f5323, %f5530, %f5530; fma.rn.f32 %f5322, %f5546, %f5546, %f5323; add.u64 %rd1288, %SPL, 0; setp.eq.f32 %p565, %f532, 0f3F800000; mov.u32 %r1246, 1065353216; selp.f32 %f3669, 0f3F800000, %f5557, %p565; mul.f32 %f3670, %f537, %f3669; add.f32 %f3671, %f5322, 0f00000000; add.f32 %f3672, %f3671, %f5324; mul.f32 %f3673, %f3672, 0f3F000000; sub.f32 %f3674, %f5322, %f3673; sub.f32 %f3675, %f5324, %f3673; mul.f32 %f5558, %f3674, %f3670; mul.f32 %f5559, 
%f5326, %f3670; mul.f32 %f5561, %f3675, %f3670; fma.rn.f32 %f3676, %f532, %f532, 0fBF800000; mul.f32 %f3677, %f533, 0f3F000000; mul.f32 %f3678, %f3676, %f3677; add.u64 %rd872, %SPL, 32; st.local.v4.f32 [%rd872], {%f5328, %f5328, %f5328, %f5328}; mov.u64 %rd873, 0; st.local.v2.u64 [%rd1288], {%rd873, %rd873}; st.local.u32 [%rd1288], %r1246; st.local.u32 [%rd1288+12], %r1246; ld.local.v4.f32 {%f3680, %f3681, %f3682, %f3683}, [%rd1288]; mul.f32 %f5562, %f3678, %f3680; mul.f32 %f5563, %f3678, %f3681; mul.f32 %f5564, %f3678, %f3682; mul.f32 %f5565, %f3678, %f3683; setp.ltu.f32 %p566, %f532, 0f3F800000; mov.f32 %f5560, %f5559; @%p566 bra $L__BB0_422; mov.f32 %f5329, 0f00000000; add.f32 %f5558, %f5558, %f5562; add.f32 %f566, %f5559, %f5563; add.f32 %f5560, %f5559, %f5564; add.f32 %f5561, %f5561, %f5565; st.local.v4.f32 [%rd872], {%f5329, %f5329, %f5329, %f5329}; mov.f32 %f5559, %f566; mov.f32 %f5562, %f5329; mov.f32 %f5563, %f5329; mov.f32 %f5564, %f5329; mov.f32 %f5565, %f5329; $L__BB0_422: mov.b32 %f5332, %r10; mul.f32 %f5331, %f5332, 0f3F7FBE77; fma.rn.f32 %f5330, %f5331, %f5332, 0f3A83126F; fma.rn.f32 %f3692, %f5330, %f5560, %f5564; mov.b32 %r1247, %f3692; fma.rn.f32 %f3693, %f5330, %f5561, %f5565; mov.b32 %r1248, %f3693; fma.rn.f32 %f3694, %f5330, %f5558, %f5562; mov.b32 %r1249, %f3694; fma.rn.f32 %f3695, %f5330, %f5559, %f5563; mov.b32 %r1250, %f3695; mov.b64 %rd1375, {%r1249, %r1250}; mov.b64 %rd1376, {%r1247, %r1248}; bra.uni $L__BB0_455; $L__BB0_425: setp.lt.s32 %p573, %r219, 0; min.f32 %f3716, %f585, %f584; max.f32 %f3717, %f585, %f584; div.rn.f32 %f3718, %f3716, %f3717; mul.rn.f32 %f3719, %f3718, %f3718; mov.f32 %f3720, 0fC0B59883; mov.f32 %f3721, 0fBF52C7EA; fma.rn.f32 %f3722, %f3719, %f3721, %f3720; mov.f32 %f3723, 0fC0D21907; fma.rn.f32 %f3724, %f3722, %f3719, %f3723; mul.f32 %f3725, %f3719, %f3724; mul.f32 %f3726, %f3718, %f3725; add.f32 %f3727, %f3719, 0f41355DC0; mov.f32 %f3728, 0f41E6BD60; fma.rn.f32 %f3729, %f3727, %f3719, %f3728; mov.f32 %f3730, 
0f419D92C8; fma.rn.f32 %f3731, %f3729, %f3719, %f3730; rcp.rn.f32 %f3732, %f3731; fma.rn.f32 %f3733, %f3726, %f3732, %f3718; mov.f32 %f3734, 0f3FC90FDB; sub.f32 %f3735, %f3734, %f3733; setp.gt.f32 %p574, %f585, %f584; selp.f32 %f3736, %f3735, %f3733, %p574; mov.f32 %f3737, 0f40490FDB; sub.f32 %f3738, %f3737, %f3736; selp.f32 %f3739, %f3738, %f3736, %p573; mov.b32 %r1252, %f3739; or.b32 %r1253, %r220, %r1252; mov.b32 %f3740, %r1253; add.f32 %f3741, %f584, %f585; setp.le.f32 %p575, %f3741, 0f7F800000; selp.f32 %f5566, %f3740, %f3741, %p575; $L__BB0_428: abs.f32 %f590, %f580; setp.eq.f32 %p577, %f590, 0f00000000; abs.f32 %f591, %f581; setp.eq.f32 %p578, %f591, 0f00000000; and.pred %p579, %p577, %p578; mov.b32 %r221, %f580; mov.b32 %r1259, %f581; and.b32 %r222, %r1259, -2147483648; @%p579 bra $L__BB0_432; bra.uni $L__BB0_429; $L__BB0_432: shr.s32 %r1264, %r221, 31; and.b32 %r1265, %r1264, 1078530011; or.b32 %r1266, %r1265, %r222; mov.b32 %f5567, %r1266; bra.uni $L__BB0_433; $L__BB0_429: setp.eq.f32 %p580, %f590, 0f7F800000; setp.eq.f32 %p581, %f591, 0f7F800000; and.pred %p582, %p580, %p581; @%p582 bra $L__BB0_431; bra.uni $L__BB0_430; $L__BB0_431: setp.lt.s32 %p586, %r221, 0; selp.b32 %r1262, 1075235812, 1061752795, %p586; or.b32 %r1263, %r1262, %r222; mov.b32 %f5567, %r1263; bra.uni $L__BB0_433; $L__BB0_430: setp.lt.s32 %p583, %r221, 0; min.f32 %f3742, %f591, %f590; max.f32 %f3743, %f591, %f590; div.rn.f32 %f3744, %f3742, %f3743; mul.rn.f32 %f3745, %f3744, %f3744; mov.f32 %f3746, 0fC0B59883; mov.f32 %f3747, 0fBF52C7EA; fma.rn.f32 %f3748, %f3745, %f3747, %f3746; mov.f32 %f3749, 0fC0D21907; fma.rn.f32 %f3750, %f3748, %f3745, %f3749; mul.f32 %f3751, %f3745, %f3750; mul.f32 %f3752, %f3744, %f3751; add.f32 %f3753, %f3745, 0f41355DC0; mov.f32 %f3754, 0f41E6BD60; fma.rn.f32 %f3755, %f3753, %f3745, %f3754; mov.f32 %f3756, 0f419D92C8; fma.rn.f32 %f3757, %f3755, %f3745, %f3756; rcp.rn.f32 %f3758, %f3757; fma.rn.f32 %f3759, %f3752, %f3758, %f3744; mov.f32 %f3760, 0f3FC90FDB; 
sub.f32 %f3761, %f3760, %f3759; setp.gt.f32 %p584, %f591, %f590; selp.f32 %f3762, %f3761, %f3759, %p584; mov.f32 %f3763, 0f40490FDB; sub.f32 %f3764, %f3763, %f3762; selp.f32 %f3765, %f3764, %f3762, %p583; mov.b32 %r1260, %f3765; or.b32 %r1261, %r222, %r1260; mov.b32 %f3766, %r1261; add.f32 %f3767, %f590, %f591; setp.le.f32 %p585, %f3767, 0f7F800000; selp.f32 %f5567, %f3766, %f3767, %p585; $L__BB0_433: sub.f32 %f3768, %f5567, %f5566; mul.f32 %f596, %f3768, 0f3F000000; add.f32 %f3769, %f5566, %f5567; mul.f32 %f597, %f3769, 0f3F000000; mul.f32 %f3770, %f596, 0f3F22F983; cvt.rni.s32.f32 %r1776, %f3770; cvt.rn.f32.s32 %f3771, %r1776; mov.f32 %f3772, 0fBFC90FDA; fma.rn.f32 %f3773, %f3771, %f3772, %f596; mov.f32 %f3774, 0fB3A22168; fma.rn.f32 %f3775, %f3771, %f3774, %f3773; mov.f32 %f3776, 0fA7C234C5; fma.rn.f32 %f5568, %f3771, %f3776, %f3775; abs.f32 %f599, %f596; setp.leu.f32 %p587, %f599, 0f47CE4780; @%p587 bra $L__BB0_441; setp.eq.f32 %p588, %f599, 0f7F800000; @%p588 bra $L__BB0_440; bra.uni $L__BB0_435; $L__BB0_440: mov.f32 %f3779, 0f00000000; mul.rn.f32 %f5568, %f596, %f3779; bra.uni $L__BB0_441; $L__BB0_252: mov.b32 %r899, %f5507; xor.b32 %r900, %r899, -2147483648; mov.b32 %f2632, %r900; selp.f32 %f5509, %f2632, %f5507, %p9; setp.geu.f32 %p360, %f307, 0f00000000; @%p360 bra $L__BB0_256; mov.f32 %f5236, 0f3F000000; cvt.rzi.f32.f32 %f2634, %f5236; setp.eq.f32 %p361, %f2634, 0f3F000000; @%p361 bra $L__BB0_256; mov.f32 %f5509, 0f7FFFFFFF; $L__BB0_256: abs.f32 %f5233, %f307; add.f32 %f2637, %f5233, 0f3F000000; mov.b32 %r901, %f2637; setp.lt.s32 %p363, %r901, 2139095040; @%p363 bra $L__BB0_261; abs.f32 %f5234, %f307; setp.gtu.f32 %p364, %f5234, 0f7F800000; @%p364 bra $L__BB0_260; bra.uni $L__BB0_258; $L__BB0_260: add.f32 %f5509, %f307, 0f3F000000; bra.uni $L__BB0_261; $L__BB0_297: mov.b32 %r146, %f365; bfe.u32 %r970, %r146, 23, 8; add.s32 %r147, %r970, -128; shl.b32 %r971, %r146, 8; or.b32 %r148, %r971, -2147483648; shr.u32 %r149, %r147, 5; add.u64 %rd757, %SP, 32; 
add.u64 %rd1346, %SPL, 32; mov.u32 %r1753, 0; mov.u64 %rd1345, __cudart_i2opi_f; mov.u32 %r1754, %r1753; $L__BB0_298: .pragma "nounroll"; mov.u32 %r151, %r1754; ld.global.nc.u32 %r974, [%rd1345]; // begin inline asm { mad.lo.cc.u32 %r972, %r974, %r148, %r151; madc.hi.u32 %r1754, %r974, %r148, 0; } // end inline asm st.local.u32 [%rd1346], %r972; add.s64 %rd1346, %rd1346, 4; add.s64 %rd1345, %rd1345, 4; add.s32 %r1753, %r1753, 1; setp.ne.s32 %p416, %r1753, 6; @%p416 bra $L__BB0_298; mov.u32 %r979, -1560706194; // begin inline asm { mad.lo.cc.u32 %r977, %r979, %r148, %r151; madc.hi.u32 %r978, %r979, %r148, 0; } // end inline asm st.local.u32 [%rd94], %r978; mov.u32 %r982, 4; sub.s32 %r154, %r982, %r149; mov.u32 %r983, 6; sub.s32 %r984, %r983, %r149; cvta.to.local.u64 %rd759, %rd757; mul.wide.s32 %rd760, %r984, 4; add.s64 %rd761, %rd759, %rd760; ld.local.u32 %r1755, [%rd761]; ld.local.u32 %r1756, [%rd761+-4]; and.b32 %r157, %r147, 31; setp.eq.s32 %p417, %r157, 0; @%p417 bra $L__BB0_301; mov.u32 %r985, 32; sub.s32 %r986, %r985, %r157; shr.u32 %r987, %r1756, %r986; shl.b32 %r988, %r1755, %r157; add.s32 %r1755, %r987, %r988; mul.wide.s32 %rd764, %r154, 4; add.s64 %rd765, %rd759, %rd764; ld.local.u32 %r989, [%rd765]; shr.u32 %r990, %r989, %r986; shl.b32 %r991, %r1756, %r157; add.s32 %r1756, %r990, %r991; $L__BB0_301: and.b32 %r992, %r146, -2147483648; shr.u32 %r993, %r1756, 30; shl.b32 %r994, %r1755, 2; or.b32 %r995, %r993, %r994; shr.u32 %r996, %r995, 31; shr.u32 %r997, %r1755, 30; add.s32 %r998, %r996, %r997; neg.s32 %r999, %r998; setp.eq.s32 %p418, %r992, 0; selp.b32 %r1757, %r998, %r999, %p418; setp.ne.s32 %p419, %r996, 0; xor.b32 %r1000, %r992, -2147483648; selp.b32 %r1001, %r1000, %r992, %p419; selp.b32 %r1002, -1, 0, %p419; xor.b32 %r1003, %r995, %r1002; shl.b32 %r1004, %r1756, 2; xor.b32 %r1005, %r1004, %r1002; cvt.u64.u32 %rd766, %r1003; cvt.u64.u32 %rd767, %r1005; bfi.b64 %rd768, %rd766, %rd767, 32, 32; cvt.rn.f64.s64 %fd13, %rd768; mul.f64 %fd14, %fd13, 
0d3BF921FB54442D19; cvt.rn.f32.f64 %f2912, %fd14; setp.eq.s32 %p420, %r1001, 0; neg.f32 %f2913, %f2912; selp.f32 %f5520, %f2912, %f2913, %p420; $L__BB0_303: mul.f32 %f2915, %f366, 0f3F22F983; cvt.rni.s32.f32 %r1762, %f2915; cvt.rn.f32.s32 %f2916, %r1762; fma.rn.f32 %f2918, %f2916, %f2907, %f366; fma.rn.f32 %f2920, %f2916, %f2909, %f2918; fma.rn.f32 %f5521, %f2916, %f2911, %f2920; abs.f32 %f373, %f366; setp.leu.f32 %p421, %f373, 0f47CE4780; @%p421 bra $L__BB0_311; setp.eq.f32 %p422, %f373, 0f7F800000; @%p422 bra $L__BB0_310; bra.uni $L__BB0_305; $L__BB0_310: mul.rn.f32 %f5521, %f366, %f1058; bra.uni $L__BB0_311; $L__BB0_305: mov.b32 %r165, %f366; bfe.u32 %r1008, %r165, 23, 8; add.s32 %r166, %r1008, -128; shl.b32 %r1009, %r165, 8; or.b32 %r167, %r1009, -2147483648; shr.u32 %r168, %r166, 5; add.u64 %rd770, %SP, 32; add.u64 %rd1348, %SPL, 32; mov.u32 %r1758, 0; mov.u64 %rd1347, __cudart_i2opi_f; mov.u32 %r1759, %r1758; $L__BB0_306: .pragma "nounroll"; mov.u32 %r170, %r1759; ld.global.nc.u32 %r1012, [%rd1347]; // begin inline asm { mad.lo.cc.u32 %r1010, %r1012, %r167, %r170; madc.hi.u32 %r1759, %r1012, %r167, 0; } // end inline asm st.local.u32 [%rd1348], %r1010; add.s64 %rd1348, %rd1348, 4; add.s64 %rd1347, %rd1347, 4; add.s32 %r1758, %r1758, 1; setp.ne.s32 %p423, %r1758, 6; @%p423 bra $L__BB0_306; mov.u32 %r1017, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1015, %r1017, %r167, %r170; madc.hi.u32 %r1016, %r1017, %r167, 0; } // end inline asm st.local.u32 [%rd94], %r1016; mov.u32 %r1020, 4; sub.s32 %r173, %r1020, %r168; mov.u32 %r1021, 6; sub.s32 %r1022, %r1021, %r168; cvta.to.local.u64 %rd772, %rd770; mul.wide.s32 %rd773, %r1022, 4; add.s64 %rd774, %rd772, %rd773; ld.local.u32 %r1760, [%rd774]; ld.local.u32 %r1761, [%rd774+-4]; and.b32 %r176, %r166, 31; setp.eq.s32 %p424, %r176, 0; @%p424 bra $L__BB0_309; mov.u32 %r1023, 32; sub.s32 %r1024, %r1023, %r176; shr.u32 %r1025, %r1761, %r1024; shl.b32 %r1026, %r1760, %r176; add.s32 %r1760, %r1025, %r1026; mul.wide.s32 
%rd777, %r173, 4; add.s64 %rd778, %rd772, %rd777; ld.local.u32 %r1027, [%rd778]; shr.u32 %r1028, %r1027, %r1024; shl.b32 %r1029, %r1761, %r176; add.s32 %r1761, %r1028, %r1029; $L__BB0_309: and.b32 %r1030, %r165, -2147483648; shr.u32 %r1031, %r1761, 30; shl.b32 %r1032, %r1760, 2; or.b32 %r1033, %r1031, %r1032; shr.u32 %r1034, %r1033, 31; shr.u32 %r1035, %r1760, 30; add.s32 %r1036, %r1034, %r1035; neg.s32 %r1037, %r1036; setp.eq.s32 %p425, %r1030, 0; selp.b32 %r1762, %r1036, %r1037, %p425; setp.ne.s32 %p426, %r1034, 0; xor.b32 %r1038, %r1030, -2147483648; selp.b32 %r1039, %r1038, %r1030, %p426; selp.b32 %r1040, -1, 0, %p426; xor.b32 %r1041, %r1033, %r1040; shl.b32 %r1042, %r1761, 2; xor.b32 %r1043, %r1042, %r1040; cvt.u64.u32 %rd779, %r1041; cvt.u64.u32 %rd780, %r1043; bfi.b64 %rd781, %rd779, %rd780, 32, 32; cvt.rn.f64.s64 %fd15, %rd781; mul.f64 %fd16, %fd15, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f2922, %fd16; setp.eq.s32 %p427, %r1039, 0; neg.f32 %f2923, %f2922; selp.f32 %f5521, %f2922, %f2923, %p427; $L__BB0_311: mov.f32 %f5393, 0f3FB8AA3B; mov.f32 %f5192, 0f3F800000; mul.f32 %f2925, %f5520, %f5520; mov.f32 %f2926, 0fBAB607ED; mov.f32 %f2927, 0f37CBAC00; fma.rn.f32 %f2928, %f2927, %f2925, %f2926; mov.f32 %f2929, 0f3D2AAABB; fma.rn.f32 %f2930, %f2928, %f2925, %f2929; mov.f32 %f2931, 0fBEFFFFFF; fma.rn.f32 %f2932, %f2930, %f2925, %f2931; fma.rn.f32 %f2934, %f2932, %f2925, %f5192; mov.f32 %f2935, 0f3C0885E4; mov.f32 %f5523, 0fB94D4153; fma.rn.f32 %f2937, %f5523, %f2925, %f2935; mov.f32 %f2938, 0fBE2AAAA8; fma.rn.f32 %f2939, %f2937, %f2925, %f2938; fma.rn.f32 %f2941, %f2925, %f5520, %f1058; fma.rn.f32 %f2942, %f2939, %f2941, %f5520; and.b32 %r1044, %r1757, 1; setp.eq.b32 %p428, %r1044, 1; selp.f32 %f2943, %f2934, %f2942, %p428; selp.f32 %f2944, %f2942, %f2934, %p428; neg.f32 %f2945, %f2943; and.b32 %r1045, %r1757, 2; setp.eq.s32 %p429, %r1045, 0; selp.f32 %f2946, %f2943, %f2945, %p429; neg.f32 %f2947, %f2944; add.s32 %r1046, %r1757, 1; and.b32 %r1047, %r1046, 2; 
setp.eq.s32 %p430, %r1047, 0; selp.f32 %f2948, %f2944, %f2947, %p430; mul.f32 %f2949, %f5521, %f5521; fma.rn.f32 %f2950, %f2927, %f2949, %f2926; fma.rn.f32 %f2951, %f2950, %f2949, %f2929; fma.rn.f32 %f2952, %f2951, %f2949, %f2931; fma.rn.f32 %f2953, %f2952, %f2949, %f5192; fma.rn.f32 %f2954, %f2949, %f5521, %f1058; fma.rn.f32 %f2955, %f5523, %f2949, %f2935; fma.rn.f32 %f2956, %f2955, %f2949, %f2938; fma.rn.f32 %f2957, %f2956, %f2954, %f5521; and.b32 %r1048, %r1762, 1; setp.eq.b32 %p431, %r1048, 1; selp.f32 %f2958, %f2953, %f2957, %p431; selp.f32 %f2959, %f2957, %f2953, %p431; and.b32 %r1049, %r1762, 2; setp.eq.s32 %p432, %r1049, 0; neg.f32 %f2960, %f2958; selp.f32 %f2961, %f2958, %f2960, %p432; add.s32 %r1050, %r1762, 1; and.b32 %r1051, %r1050, 2; setp.eq.s32 %p433, %r1051, 0; neg.f32 %f2962, %f2959; selp.f32 %f2963, %f2959, %f2962, %p433; mov.b32 %r1052, %f2963; neg.f32 %f2964, %f2961; mov.b32 %r1053, %f2961; cvt.u64.u32 %rd782, %r1053; cvt.u64.u32 %rd783, %r1052; bfi.b64 %rd167, %rd782, %rd783, 32, 32; mov.b32 %r1054, %f2964; cvt.u64.u32 %rd784, %r1054; bfi.b64 %rd168, %rd783, %rd784, 32, 32; mul.f32 %f2965, %f351, %f2946; mov.b32 %r1055, %f2965; cvt.u64.u32 %rd785, %r1055; mov.b32 %r1056, %f2948; cvt.u64.u32 %rd786, %r1056; bfi.b64 %rd169, %rd785, %rd786, 32, 32; neg.f32 %f2966, %f2946; mov.b32 %r1057, %f2966; mul.f32 %f2967, %f351, %f2948; mov.b32 %r1058, %f2967; cvt.u64.u32 %rd787, %r1058; cvt.u64.u32 %rd788, %r1057; bfi.b64 %rd170, %rd787, %rd788, 32, 32; ld.global.f32 %f2968, [%rd67+44]; ld.f32 %f2969, [%rd156]; mul.f32 %f2970, %f2969, %f2968; ld.global.f32 %f2971, [%rd67+52]; sub.f32 %f2972, %f2970, %f2971; ld.global.f32 %f2973, [%rd67+48]; mul.f32 %f2974, %f2969, %f2973; neg.f32 %f2975, %f2974; mov.f32 %f2976, 0f3F000000; mov.f32 %f2977, 0f3BBB989D; fma.rn.f32 %f2978, %f2975, %f2977, %f2976; mov.f32 %f2980, 0f437C0000; cvt.sat.f32.f32 %f2981, %f2978; mov.f32 %f2982, 0f4B400001; fma.rm.f32 %f2983, %f2981, %f2980, %f2982; add.f32 %f2984, %f2983, 0fCB40007F; 
neg.f32 %f2985, %f2984; fma.rn.f32 %f2986, %f2975, %f5393, %f2985; mov.f32 %f2987, 0f32A57060; fma.rn.f32 %f2988, %f2975, %f2987, %f2986; mov.b32 %r1059, %f2983; shl.b32 %r1060, %r1059, 23; mov.b32 %f2989, %r1060; ex2.approx.ftz.f32 %f2990, %f2988; mul.f32 %f2991, %f2990, %f2989; ld.global.f32 %f2992, [%rd67+40]; fma.rn.f32 %f377, %f2972, %f2991, %f2992; mul.f32 %f2993, %f377, 0f3F22F983; cvt.rni.s32.f32 %r1767, %f2993; cvt.rn.f32.s32 %f2994, %r1767; fma.rn.f32 %f2996, %f2994, %f2907, %f377; fma.rn.f32 %f2998, %f2994, %f2909, %f2996; fma.rn.f32 %f5522, %f2994, %f2911, %f2998; abs.f32 %f379, %f377; setp.leu.f32 %p434, %f379, 0f47CE4780; @%p434 bra $L__BB0_319; setp.eq.f32 %p435, %f379, 0f7F800000; @%p435 bra $L__BB0_318; bra.uni $L__BB0_313; $L__BB0_318: mul.rn.f32 %f5522, %f377, %f1058; bra.uni $L__BB0_319; $L__BB0_313: mov.b32 %r184, %f377; bfe.u32 %r1063, %r184, 23, 8; add.s32 %r185, %r1063, -128; shl.b32 %r1064, %r184, 8; or.b32 %r186, %r1064, -2147483648; shr.u32 %r187, %r185, 5; add.u64 %rd790, %SP, 32; add.u64 %rd1350, %SPL, 32; mov.u32 %r1763, 0; mov.u64 %rd1349, __cudart_i2opi_f; mov.u32 %r1764, %r1763; $L__BB0_314: .pragma "nounroll"; mov.u32 %r189, %r1764; ld.global.nc.u32 %r1067, [%rd1349]; // begin inline asm { mad.lo.cc.u32 %r1065, %r1067, %r186, %r189; madc.hi.u32 %r1764, %r1067, %r186, 0; } // end inline asm st.local.u32 [%rd1350], %r1065; add.s64 %rd1350, %rd1350, 4; add.s64 %rd1349, %rd1349, 4; add.s32 %r1763, %r1763, 1; setp.ne.s32 %p436, %r1763, 6; @%p436 bra $L__BB0_314; mov.u32 %r1072, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1070, %r1072, %r186, %r189; madc.hi.u32 %r1071, %r1072, %r186, 0; } // end inline asm st.local.u32 [%rd94], %r1071; mov.u32 %r1075, 4; sub.s32 %r192, %r1075, %r187; mov.u32 %r1076, 6; sub.s32 %r1077, %r1076, %r187; cvta.to.local.u64 %rd792, %rd790; mul.wide.s32 %rd793, %r1077, 4; add.s64 %rd794, %rd792, %rd793; ld.local.u32 %r1765, [%rd794]; ld.local.u32 %r1766, [%rd794+-4]; and.b32 %r195, %r185, 31; setp.eq.s32 
%p437, %r195, 0; @%p437 bra $L__BB0_317; mov.u32 %r1078, 32; sub.s32 %r1079, %r1078, %r195; shr.u32 %r1080, %r1766, %r1079; shl.b32 %r1081, %r1765, %r195; add.s32 %r1765, %r1080, %r1081; mul.wide.s32 %rd797, %r192, 4; add.s64 %rd798, %rd792, %rd797; ld.local.u32 %r1082, [%rd798]; shr.u32 %r1083, %r1082, %r1079; shl.b32 %r1084, %r1766, %r195; add.s32 %r1766, %r1083, %r1084; $L__BB0_317: and.b32 %r1085, %r184, -2147483648; shr.u32 %r1086, %r1766, 30; shl.b32 %r1087, %r1765, 2; or.b32 %r1088, %r1086, %r1087; shr.u32 %r1089, %r1088, 31; shr.u32 %r1090, %r1765, 30; add.s32 %r1091, %r1089, %r1090; neg.s32 %r1092, %r1091; setp.eq.s32 %p438, %r1085, 0; selp.b32 %r1767, %r1091, %r1092, %p438; setp.ne.s32 %p439, %r1089, 0; xor.b32 %r1093, %r1085, -2147483648; selp.b32 %r1094, %r1093, %r1085, %p439; selp.b32 %r1095, -1, 0, %p439; xor.b32 %r1096, %r1088, %r1095; shl.b32 %r1097, %r1766, 2; xor.b32 %r1098, %r1097, %r1095; cvt.u64.u32 %rd799, %r1096; cvt.u64.u32 %rd800, %r1098; bfi.b64 %rd801, %rd799, %rd800, 32, 32; cvt.rn.f64.s64 %fd17, %rd801; mul.f64 %fd18, %fd17, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f3000, %fd18; setp.eq.s32 %p440, %r1094, 0; neg.f32 %f3001, %f3000; selp.f32 %f5522, %f3000, %f3001, %p440; $L__BB0_319: and.b32 %r202, %r1767, 1; setp.eq.s32 %p441, %r202, 0; selp.f32 %f383, %f5522, 0f3F800000, %p441; mul.rn.f32 %f384, %f5522, %f5522; @%p441 bra $L__BB0_321; fma.rn.f32 %f5523, %f2927, %f384, %f2926; $L__BB0_321: selp.f32 %f3006, 0f3C0885E4, 0f3D2AAABB, %p441; fma.rn.f32 %f3007, %f5523, %f384, %f3006; selp.f32 %f3008, 0fBE2AAAA8, 0fBEFFFFFF, %p441; fma.rn.f32 %f3009, %f3007, %f384, %f3008; fma.rn.f32 %f3011, %f384, %f383, %f1058; fma.rn.f32 %f5524, %f3009, %f3011, %f383; and.b32 %r1099, %r1767, 2; setp.eq.s32 %p443, %r1099, 0; @%p443 bra $L__BB0_323; mov.f32 %f3013, 0fBF800000; fma.rn.f32 %f5524, %f5524, %f3013, %f1058; $L__BB0_323: ld.f32 %f390, [%rd156+8]; mul.f32 %f3014, %f350, 0f4B000000; setp.lt.f32 %p444, %f350, 0f00800000; selp.f32 %f391, %f3014, %f350, 
%p444; selp.f32 %f3015, 0fC1B80000, 0f00000000, %p444; mov.b32 %r1100, %f391; add.s32 %r1101, %r1100, -1059760811; and.b32 %r1102, %r1101, -8388608; sub.s32 %r1103, %r1100, %r1102; mov.b32 %f3016, %r1103; cvt.rn.f32.s32 %f3017, %r1102; mov.f32 %f3018, 0f34000000; fma.rn.f32 %f3019, %f3017, %f3018, %f3015; add.f32 %f3020, %f3016, 0fBF800000; mov.f32 %f3021, 0f3E1039F6; mov.f32 %f3022, 0fBE055027; fma.rn.f32 %f3023, %f3022, %f3020, %f3021; mov.f32 %f3024, 0fBDF8CDCC; fma.rn.f32 %f3025, %f3023, %f3020, %f3024; mov.f32 %f3026, 0f3E0F2955; fma.rn.f32 %f3027, %f3025, %f3020, %f3026; mov.f32 %f3028, 0fBE2AD8B9; fma.rn.f32 %f3029, %f3027, %f3020, %f3028; mov.f32 %f3030, 0f3E4CED0B; fma.rn.f32 %f3031, %f3029, %f3020, %f3030; mov.f32 %f3032, 0fBE7FFF22; fma.rn.f32 %f3033, %f3031, %f3020, %f3032; mov.f32 %f3034, 0f3EAAAA78; fma.rn.f32 %f3035, %f3033, %f3020, %f3034; mov.f32 %f3036, 0fBF000000; fma.rn.f32 %f3037, %f3035, %f3020, %f3036; mul.f32 %f3038, %f3020, %f3037; fma.rn.f32 %f3039, %f3038, %f3020, %f3020; mov.f32 %f3040, 0f3F317218; fma.rn.f32 %f5525, %f3019, %f3040, %f3039; setp.lt.u32 %p445, %r1100, 2139095040; @%p445 bra $L__BB0_325; mov.f32 %f3041, 0f7F800000; fma.rn.f32 %f5525, %f391, %f3041, %f3041; $L__BB0_325: setp.eq.f32 %p446, %f391, 0f00000000; selp.f32 %f395, 0fFF800000, %f5525, %p446; mul.f32 %f3042, %f352, 0f4B000000; setp.lt.f32 %p447, %f352, 0f00800000; selp.f32 %f396, %f3042, %f352, %p447; selp.f32 %f3043, 0fC1B80000, 0f00000000, %p447; mov.b32 %r1104, %f396; add.s32 %r1105, %r1104, -1059760811; and.b32 %r1106, %r1105, -8388608; sub.s32 %r1107, %r1104, %r1106; mov.b32 %f3044, %r1107; cvt.rn.f32.s32 %f3045, %r1106; fma.rn.f32 %f3047, %f3045, %f3018, %f3043; add.f32 %f3048, %f3044, 0fBF800000; fma.rn.f32 %f3051, %f3022, %f3048, %f3021; fma.rn.f32 %f3053, %f3051, %f3048, %f3024; fma.rn.f32 %f3055, %f3053, %f3048, %f3026; fma.rn.f32 %f3057, %f3055, %f3048, %f3028; fma.rn.f32 %f3059, %f3057, %f3048, %f3030; fma.rn.f32 %f3061, %f3059, %f3048, %f3032; fma.rn.f32 
%f3063, %f3061, %f3048, %f3034; fma.rn.f32 %f3065, %f3063, %f3048, %f3036; mul.f32 %f3066, %f3048, %f3065; fma.rn.f32 %f3067, %f3066, %f3048, %f3048; fma.rn.f32 %f5526, %f3047, %f3040, %f3067; setp.lt.u32 %p448, %r1104, 2139095040; @%p448 bra $L__BB0_327; mov.f32 %f3069, 0f7F800000; fma.rn.f32 %f5526, %f396, %f3069, %f3069; $L__BB0_327: mov.u64 %rd1272, 0; setp.eq.f32 %p449, %f396, 0f00000000; selp.f32 %f3070, 0fFF800000, %f5526, %p449; fma.rn.f32 %f400, %f390, 0f3F000000, %f395; fma.rn.f32 %f401, %f390, 0f3F000000, %f3070; add.f32 %f3071, %f400, 0f00000000; add.f32 %f402, %f3071, %f401; mul.f32 %f3072, %f402, 0f3F000000; sub.f32 %f403, %f400, %f3072; mov.b32 %r1108, %f403; sub.f32 %f404, %f401, %f3072; mov.b32 %r1109, %f404; cvt.u64.u32 %rd803, %r1109; cvt.u64.u32 %rd804, %r1108; bfi.b64 %rd805, %rd803, %rd804, 32, 32; add.u64 %rd1354, %SP, 0; cvta.to.local.u64 %rd1352, %rd1354; st.local.u64 [%rd1352], %rd805; cvta.to.local.u64 %rd1358, %rd1360; st.local.u64 [%rd1358], %rd1272; add.s64 %rd1351, %rd1352, 8; add.s64 %rd1364, %rd1358, 8; add.f32 %f3073, %f5524, %f5524; mul.f32 %f3074, %f3073, 0f3F5105EC; mov.f32 %f3075, 0f40400000; sub.f32 %f3076, %f3075, %f5524; div.rn.f32 %f405, %f3074, %f3076; mov.u64 %rd1365, 2; mov.u64 %rd1353, %rd1352; mov.u64 %rd1355, %rd1352; mov.u64 %rd1356, %rd1352; mov.u64 %rd1357, %rd1354; mov.u64 %rd1359, %rd1358; mov.u64 %rd1361, %rd1358; mov.u64 %rd1362, %rd1358; mov.u64 %rd1363, %rd1360; $L__BB0_328: setp.eq.s64 %p450, %rd1365, 0; @%p450 bra $L__BB0_335; add.s64 %rd1365, %rd1365, -1; add.s64 %rd807, %rd1352, 8; setp.eq.s64 %p451, %rd1355, %rd1351; selp.b64 %rd808, %rd807, %rd1355, %p451; add.s64 %rd809, %rd1353, 8; selp.b64 %rd810, %rd809, %rd1356, %p451; add.s64 %rd811, %rd1354, 8; selp.b64 %rd812, %rd811, %rd1357, %p451; setp.eq.s64 %p452, %rd1365, 0; add.s64 %rd813, %rd808, 4; add.s64 %rd814, %rd810, 4; add.s64 %rd815, %rd812, 4; selp.b64 %rd198, %rd808, %rd813, %p452; selp.b64 %rd1356, %rd810, %rd814, %p452; selp.b64 %rd1357, 
%rd812, %rd815, %p452; selp.b64 %rd1352, %rd807, %rd1352, %p451; selp.b64 %rd1353, %rd809, %rd1353, %p451; selp.b64 %rd1354, %rd811, %rd1354, %p451; add.s64 %rd816, %rd1355, 8; selp.b64 %rd1351, %rd816, %rd1351, %p451; add.s64 %rd817, %rd1361, 8; setp.eq.s64 %p453, %rd1358, %rd1364; selp.b64 %rd818, %rd817, %rd1358, %p453; add.s64 %rd819, %rd1362, 8; selp.b64 %rd820, %rd819, %rd1359, %p453; add.s64 %rd821, %rd1363, 8; selp.b64 %rd822, %rd821, %rd1360, %p453; selp.b64 %rd1361, %rd817, %rd1361, %p453; selp.b64 %rd1362, %rd819, %rd1362, %p453; selp.b64 %rd1363, %rd821, %rd1363, %p453; add.s64 %rd823, %rd1358, 8; selp.b64 %rd1364, %rd823, %rd1364, %p453; add.s64 %rd824, %rd818, 4; add.s64 %rd825, %rd820, 4; add.s64 %rd826, %rd822, 4; selp.b64 %rd1358, %rd818, %rd824, %p452; selp.b64 %rd1359, %rd820, %rd825, %p452; selp.b64 %rd1360, %rd822, %rd826, %p452; ld.local.f32 %f3077, [%rd820]; ld.local.f32 %f3078, [%rd810]; setp.eq.f32 %p454, %f3078, %f3077; mov.u64 %rd1355, %rd198; @%p454 bra $L__BB0_328; setp.gt.f32 %p455, %f402, 0f00000000; @%p455 bra $L__BB0_335; bra.uni $L__BB0_331; $L__BB0_335: mul.f32 %f3120, %f401, %f401; fma.rn.f32 %f3121, %f400, %f400, %f3120; add.f32 %f3122, %f3121, 0f00000000; sqrt.rn.f32 %f5527, %f3122; mov.u64 %rd1366, 4575657222473777152; $L__BB0_336: mov.f32 %f5447, 0f3F317218; mov.f32 %f5446, 0fBF000000; mov.f32 %f5445, 0f3EAAAA78; mov.f32 %f5444, 0fBE7FFF22; mov.f32 %f5443, 0f3E4CED0B; mov.f32 %f5442, 0fBE2AD8B9; mov.f32 %f5441, 0f3E0F2955; mov.f32 %f5440, 0fBDF8CDCC; mov.f32 %f5439, 0f3E1039F6; mov.f32 %f5438, 0fBE055027; mov.f32 %f5437, 0f34000000; mov.u64 %rd1273, 0; mov.b32 %r1116, %f5527; cvt.u64.u32 %rd831, %r1116; or.b64 %rd217, %rd831, %rd1273; shr.u64 %rd832, %rd1366, 32; cvt.u32.u64 %r1117, %rd832; cvt.u32.u64 %r1118, %rd1366; mov.b32 %f410, %r1118; mov.b32 %f411, %r1117; mul.f32 %f412, %f350, %f352; setp.lt.f32 %p457, %f412, 0f00800000; mul.f32 %f3123, %f412, 0f4B000000; selp.f32 %f413, %f3123, %f412, %p457; selp.f32 %f3124, 
0fC1B80000, 0f00000000, %p457; mov.b32 %r1119, %f413; add.s32 %r1120, %r1119, -1059760811; and.b32 %r1121, %r1120, -8388608; sub.s32 %r1122, %r1119, %r1121; mov.b32 %f3125, %r1122; cvt.rn.f32.s32 %f3126, %r1121; fma.rn.f32 %f3128, %f3126, %f5437, %f3124; add.f32 %f3129, %f3125, 0fBF800000; fma.rn.f32 %f3132, %f5438, %f3129, %f5439; fma.rn.f32 %f3134, %f3132, %f3129, %f5440; fma.rn.f32 %f3136, %f3134, %f3129, %f5441; fma.rn.f32 %f3138, %f3136, %f3129, %f5442; fma.rn.f32 %f3140, %f3138, %f3129, %f5443; fma.rn.f32 %f3142, %f3140, %f3129, %f5444; fma.rn.f32 %f3144, %f3142, %f3129, %f5445; fma.rn.f32 %f3146, %f3144, %f3129, %f5446; mul.f32 %f3147, %f3129, %f3146; fma.rn.f32 %f3148, %f3147, %f3129, %f3129; fma.rn.f32 %f5528, %f3128, %f5447, %f3148; setp.lt.u32 %p458, %r1119, 2139095040; @%p458 bra $L__BB0_338; mov.f32 %f3150, 0f7F800000; fma.rn.f32 %f5528, %f413, %f3150, %f3150; $L__BB0_338: mov.f32 %f5458, 0f3F317218; mov.f32 %f5457, 0fBF000000; mov.f32 %f5456, 0f3EAAAA78; mov.f32 %f5455, 0fBE7FFF22; mov.f32 %f5454, 0f3E4CED0B; mov.f32 %f5453, 0fBE2AD8B9; mov.f32 %f5452, 0f3E0F2955; mov.f32 %f5451, 0fBDF8CDCC; mov.f32 %f5450, 0f3E1039F6; mov.f32 %f5449, 0fBE055027; mov.f32 %f5448, 0f34000000; mul.f32 %f3151, %f410, %f411; setp.eq.f32 %p459, %f413, 0f00000000; selp.f32 %f417, 0fFF800000, %f5528, %p459; mul.f32 %f3152, %f3151, 0f4B000000; setp.lt.f32 %p460, %f3151, 0f00800000; selp.f32 %f418, %f3152, %f3151, %p460; selp.f32 %f3153, 0fC1B80000, 0f00000000, %p460; mov.b32 %r1123, %f418; add.s32 %r1124, %r1123, -1059760811; and.b32 %r1125, %r1124, -8388608; sub.s32 %r1126, %r1123, %r1125; mov.b32 %f3154, %r1126; cvt.rn.f32.s32 %f3155, %r1125; fma.rn.f32 %f3157, %f3155, %f5448, %f3153; add.f32 %f3158, %f3154, 0fBF800000; fma.rn.f32 %f3161, %f5449, %f3158, %f5450; fma.rn.f32 %f3163, %f3161, %f3158, %f5451; fma.rn.f32 %f3165, %f3163, %f3158, %f5452; fma.rn.f32 %f3167, %f3165, %f3158, %f5453; fma.rn.f32 %f3169, %f3167, %f3158, %f5454; fma.rn.f32 %f3171, %f3169, %f3158, %f5455; 
fma.rn.f32 %f3173, %f3171, %f3158, %f5456; fma.rn.f32 %f3175, %f3173, %f3158, %f5457; mul.f32 %f3176, %f3158, %f3175; fma.rn.f32 %f3177, %f3176, %f3158, %f3158; fma.rn.f32 %f5529, %f3157, %f5458, %f3177; setp.lt.u32 %p461, %r1123, 2139095040; div.rn.f32 %f3179, %f412, %f3151; mul.f32 %f5532, %f5532, %f3179; @%p461 bra $L__BB0_340; mov.f32 %f3180, 0f7F800000; fma.rn.f32 %f5529, %f418, %f3180, %f3180; $L__BB0_340: add.u64 %rd1300, %SPL, 80; setp.eq.f32 %p462, %f418, 0f00000000; selp.f32 %f3181, 0fFF800000, %f5529, %p462; sub.f32 %f3182, %f417, %f3181; ld.f32 %f3183, [%rd156+8]; add.f32 %f3184, %f3183, %f3182; st.f32 [%rd156+8], %f3184; ld.f32 %f3185, [%rd156]; shl.b64 %rd834, %rd217, 32; or.b64 %rd835, %rd834, %rd832; mov.b64 {%r1127, %r1128}, %rd835; mov.b32 %f3186, %r1128; add.f32 %f3187, %f3186, %f3185; st.f32 [%rd156], %f3187; mov.b64 {%r1129, %r1130}, %rd168; mov.b64 {%r1131, %r1132}, %rd167; mov.b32 %f3188, %r1131; mul.f32 %f3189, %f3188, %f410; mov.b32 %f3190, %r1132; mul.f32 %f3191, %f3190, %f410; mov.b32 %f3192, %r1129; mul.f32 %f3193, %f3192, %f411; mov.b32 %f3194, %r1130; mul.f32 %f3195, %f3194, %f411; mov.b64 {%r1133, %r1134}, %rd170; mov.b64 {%r1135, %r1136}, %rd169; mov.b32 %f3196, %r1135; mov.b32 %f3197, %r1136; mul.f32 %f3198, %f3197, %f3193; mul.f32 %f3199, %f3197, %f3195; fma.rn.f32 %f5531, %f3196, %f3191, %f3199; mov.b32 %f3200, %r1133; mov.b32 %f3201, %r1134; mul.f32 %f3202, %f3201, %f3193; fma.rn.f32 %f5530, %f3200, %f3189, %f3202; mul.f32 %f3203, %f3201, %f3195; fma.rn.f32 %f3204, %f3200, %f3191, %f3203; fma.rn.f32 %f3205, %f3196, %f3189, %f3198; st.local.v4.f32 [%rd1300], {%f3205, %f5531, %f5530, %f3204}; bra.uni $L__BB0_341; $L__BB0_331: mul.f32 %f3079, %f404, %f404; fma.rn.f32 %f3080, %f403, %f403, %f3079; add.f32 %f3081, %f3080, 0f00000000; sqrt.rn.f32 %f406, %f3081; ld.global.f32 %f3082, [%rd67+56]; ld.global.f32 %f3083, [%rd67+60]; add.f32 %f3084, %f3083, %f3083; fma.rn.f32 %f3085, %f3082, 0f40000000, %f3084; div.rn.f32 %f3086, %f3085, 
%f3084; mul.f32 %f3087, %f402, %f3086; fma.rn.f32 %f5527, %f405, %f3087, %f406; setp.gtu.f32 %p456, %f5527, 0f00000000; @%p456 bra $L__BB0_333; bra.uni $L__BB0_341; $L__BB0_333: mov.f32 %f5436, 0f32A57060; mov.f32 %f5435, 0f4B400001; mov.f32 %f5434, 0f437C0000; mov.f32 %f5433, 0f3F000000; mov.f32 %f5432, 0f3BBB989D; mov.f32 %f5394, 0f3FB8AA3B; div.rn.f32 %f3088, %f403, %f406; mul.f32 %f3089, %f5527, %f3088; div.rn.f32 %f3090, %f404, %f406; mul.f32 %f3091, %f5527, %f3090; sub.f32 %f3092, %f400, %f3089; sub.f32 %f3093, %f401, %f3091; fma.rn.f32 %f3096, %f3092, %f5432, %f5433; cvt.sat.f32.f32 %f3099, %f3096; fma.rm.f32 %f3101, %f3099, %f5434, %f5435; add.f32 %f3102, %f3101, 0fCB40007F; neg.f32 %f3103, %f3102; fma.rn.f32 %f3104, %f3092, %f5394, %f3103; fma.rn.f32 %f3106, %f3092, %f5436, %f3104; mov.b32 %r1110, %f3101; shl.b32 %r1111, %r1110, 23; mov.b32 %f3107, %r1111; ex2.approx.ftz.f32 %f3108, %f3106; mul.f32 %f3109, %f3108, %f3107; fma.rn.f32 %f3110, %f3093, %f5432, %f5433; cvt.sat.f32.f32 %f3111, %f3110; fma.rm.f32 %f3112, %f3111, %f5434, %f5435; add.f32 %f3113, %f3112, 0fCB40007F; neg.f32 %f3114, %f3113; fma.rn.f32 %f3115, %f3093, %f5394, %f3114; fma.rn.f32 %f3116, %f3093, %f5436, %f3115; mov.b32 %r1112, %f3112; shl.b32 %r1113, %r1112, 23; mov.b32 %f3117, %r1113; ex2.approx.ftz.f32 %f3118, %f3116; mul.f32 %f3119, %f3118, %f3117; mov.b32 %r1114, %f3109; mov.b32 %r1115, %f3119; cvt.u64.u32 %rd827, %r1115; cvt.u64.u32 %rd828, %r1114; bfi.b64 %rd1366, %rd827, %rd828, 32, 32; bra.uni $L__BB0_336; $L__BB0_435: mov.b32 %r224, %f596; bfe.u32 %r1269, %r224, 23, 8; add.s32 %r225, %r1269, -128; shl.b32 %r1270, %r224, 8; or.b32 %r226, %r1270, -2147483648; shr.u32 %r227, %r225, 5; add.u64 %rd882, %SP, 32; add.u64 %rd1372, %SPL, 32; mov.u32 %r1772, 0; mov.u64 %rd1371, __cudart_i2opi_f; mov.u32 %r1773, %r1772; $L__BB0_436: .pragma "nounroll"; mov.u32 %r229, %r1773; ld.global.nc.u32 %r1273, [%rd1371]; // begin inline asm { mad.lo.cc.u32 %r1271, %r1273, %r226, %r229; madc.hi.u32 
%r1773, %r1273, %r226, 0; } // end inline asm st.local.u32 [%rd1372], %r1271; add.s64 %rd1372, %rd1372, 4; add.s64 %rd1371, %rd1371, 4; add.s32 %r1772, %r1772, 1; setp.ne.s32 %p589, %r1772, 6; @%p589 bra $L__BB0_436; mov.u32 %r1278, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1276, %r1278, %r226, %r229; madc.hi.u32 %r1277, %r1278, %r226, 0; } // end inline asm cvta.to.local.u64 %rd884, %rd882; st.local.u32 [%rd884+24], %r1277; mov.u32 %r1281, 4; sub.s32 %r232, %r1281, %r227; mov.u32 %r1282, 6; sub.s32 %r1283, %r1282, %r227; mul.wide.s32 %rd885, %r1283, 4; add.s64 %rd886, %rd884, %rd885; ld.local.u32 %r1774, [%rd886]; ld.local.u32 %r1775, [%rd886+-4]; and.b32 %r235, %r225, 31; setp.eq.s32 %p590, %r235, 0; @%p590 bra $L__BB0_439; mov.u32 %r1284, 32; sub.s32 %r1285, %r1284, %r235; shr.u32 %r1286, %r1775, %r1285; shl.b32 %r1287, %r1774, %r235; add.s32 %r1774, %r1286, %r1287; mul.wide.s32 %rd889, %r232, 4; add.s64 %rd890, %rd884, %rd889; ld.local.u32 %r1288, [%rd890]; shr.u32 %r1289, %r1288, %r1285; shl.b32 %r1290, %r1775, %r235; add.s32 %r1775, %r1289, %r1290; $L__BB0_439: and.b32 %r1291, %r224, -2147483648; shr.u32 %r1292, %r1775, 30; shl.b32 %r1293, %r1774, 2; or.b32 %r1294, %r1292, %r1293; shr.u32 %r1295, %r1294, 31; shr.u32 %r1296, %r1774, 30; add.s32 %r1297, %r1295, %r1296; neg.s32 %r1298, %r1297; setp.eq.s32 %p591, %r1291, 0; selp.b32 %r1776, %r1297, %r1298, %p591; setp.ne.s32 %p592, %r1295, 0; xor.b32 %r1299, %r1291, -2147483648; selp.b32 %r1300, %r1299, %r1291, %p592; selp.b32 %r1301, -1, 0, %p592; xor.b32 %r1302, %r1294, %r1301; shl.b32 %r1303, %r1775, 2; xor.b32 %r1304, %r1303, %r1301; cvt.u64.u32 %rd891, %r1302; cvt.u64.u32 %rd892, %r1304; bfi.b64 %rd893, %rd891, %rd892, 32, 32; cvt.rn.f64.s64 %fd19, %rd893; mul.f64 %fd20, %fd19, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f3777, %fd20; setp.eq.s32 %p593, %r1300, 0; neg.f32 %f3778, %f3777; selp.f32 %f5568, %f3777, %f3778, %p593; $L__BB0_441: mul.f32 %f3780, %f597, 0f3F22F983; cvt.rni.s32.f32 %r1781, %f3780; 
cvt.rn.f32.s32 %f3781, %r1781; fma.rn.f32 %f3783, %f3781, %f3772, %f597; fma.rn.f32 %f3785, %f3781, %f3774, %f3783; fma.rn.f32 %f5569, %f3781, %f3776, %f3785; abs.f32 %f604, %f597; setp.leu.f32 %p594, %f604, 0f47CE4780; @%p594 bra $L__BB0_449; setp.eq.f32 %p595, %f604, 0f7F800000; @%p595 bra $L__BB0_448; bra.uni $L__BB0_443; $L__BB0_448: mov.f32 %f3789, 0f00000000; mul.rn.f32 %f5569, %f597, %f3789; bra.uni $L__BB0_449; $L__BB0_443: mov.b32 %r243, %f597; bfe.u32 %r1307, %r243, 23, 8; add.s32 %r244, %r1307, -128; shl.b32 %r1308, %r243, 8; or.b32 %r245, %r1308, -2147483648; shr.u32 %r246, %r244, 5; add.u64 %rd895, %SP, 32; add.u64 %rd1374, %SPL, 32; mov.u32 %r1777, 0; mov.u64 %rd1373, __cudart_i2opi_f; mov.u32 %r1778, %r1777; $L__BB0_444: .pragma "nounroll"; mov.u32 %r248, %r1778; ld.global.nc.u32 %r1311, [%rd1373]; // begin inline asm { mad.lo.cc.u32 %r1309, %r1311, %r245, %r248; madc.hi.u32 %r1778, %r1311, %r245, 0; } // end inline asm st.local.u32 [%rd1374], %r1309; add.s64 %rd1374, %rd1374, 4; add.s64 %rd1373, %rd1373, 4; add.s32 %r1777, %r1777, 1; setp.ne.s32 %p596, %r1777, 6; @%p596 bra $L__BB0_444; mov.u32 %r1316, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1314, %r1316, %r245, %r248; madc.hi.u32 %r1315, %r1316, %r245, 0; } // end inline asm cvta.to.local.u64 %rd897, %rd895; st.local.u32 [%rd897+24], %r1315; mov.u32 %r1319, 4; sub.s32 %r251, %r1319, %r246; mov.u32 %r1320, 6; sub.s32 %r1321, %r1320, %r246; mul.wide.s32 %rd898, %r1321, 4; add.s64 %rd899, %rd897, %rd898; ld.local.u32 %r1779, [%rd899]; ld.local.u32 %r1780, [%rd899+-4]; and.b32 %r254, %r244, 31; setp.eq.s32 %p597, %r254, 0; @%p597 bra $L__BB0_447; mov.u32 %r1322, 32; sub.s32 %r1323, %r1322, %r254; shr.u32 %r1324, %r1780, %r1323; shl.b32 %r1325, %r1779, %r254; add.s32 %r1779, %r1324, %r1325; mul.wide.s32 %rd902, %r251, 4; add.s64 %rd903, %rd897, %rd902; ld.local.u32 %r1326, [%rd903]; shr.u32 %r1327, %r1326, %r1323; shl.b32 %r1328, %r1780, %r254; add.s32 %r1780, %r1327, %r1328; $L__BB0_447: 
and.b32 %r1329, %r243, -2147483648; shr.u32 %r1330, %r1780, 30; shl.b32 %r1331, %r1779, 2; or.b32 %r1332, %r1330, %r1331; shr.u32 %r1333, %r1332, 31; shr.u32 %r1334, %r1779, 30; add.s32 %r1335, %r1333, %r1334; neg.s32 %r1336, %r1335; setp.eq.s32 %p598, %r1329, 0; selp.b32 %r1781, %r1335, %r1336, %p598; setp.ne.s32 %p599, %r1333, 0; xor.b32 %r1337, %r1329, -2147483648; selp.b32 %r1338, %r1337, %r1329, %p599; selp.b32 %r1339, -1, 0, %p599; xor.b32 %r1340, %r1332, %r1339; shl.b32 %r1341, %r1780, 2; xor.b32 %r1342, %r1341, %r1339; cvt.u64.u32 %rd904, %r1340; cvt.u64.u32 %rd905, %r1342; bfi.b64 %rd906, %rd904, %rd905, 32, 32; cvt.rn.f64.s64 %fd21, %rd906; mul.f64 %fd22, %fd21, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f3787, %fd22; setp.eq.s32 %p600, %r1338, 0; neg.f32 %f3788, %f3787; selp.f32 %f5569, %f3787, %f3788, %p600; $L__BB0_449: mov.f32 %f5198, 0f3F800000; setp.lt.f32 %p601, %f583, 0f00000000; mov.f32 %f3790, 0f00000000; selp.f32 %f3791, 0fBF800000, 0f3F800000, %p601; mul.f32 %f3793, %f5568, %f5568; mov.f32 %f3794, 0fBAB607ED; mov.f32 %f3795, 0f37CBAC00; fma.rn.f32 %f3796, %f3795, %f3793, %f3794; mov.f32 %f3797, 0f3D2AAABB; fma.rn.f32 %f3798, %f3796, %f3793, %f3797; mov.f32 %f3799, 0fBEFFFFFF; fma.rn.f32 %f3800, %f3798, %f3793, %f3799; fma.rn.f32 %f3801, %f3800, %f3793, %f5198; mov.f32 %f3802, 0f3C0885E4; mov.f32 %f3803, 0fB94D4153; fma.rn.f32 %f3804, %f3803, %f3793, %f3802; mov.f32 %f3805, 0fBE2AAAA8; fma.rn.f32 %f3806, %f3804, %f3793, %f3805; fma.rn.f32 %f3807, %f3793, %f5568, %f3790; fma.rn.f32 %f3808, %f3806, %f3807, %f5568; and.b32 %r1343, %r1776, 1; setp.eq.b32 %p602, %r1343, 1; selp.f32 %f3809, %f3801, %f3808, %p602; selp.f32 %f3810, %f3808, %f3801, %p602; neg.f32 %f3811, %f3809; and.b32 %r1344, %r1776, 2; setp.eq.s32 %p603, %r1344, 0; selp.f32 %f3812, %f3809, %f3811, %p603; neg.f32 %f3813, %f3810; add.s32 %r1345, %r1776, 1; and.b32 %r1346, %r1345, 2; setp.eq.s32 %p604, %r1346, 0; selp.f32 %f3814, %f3810, %f3813, %p604; mul.f32 %f3815, %f5569, %f5569; 
fma.rn.f32 %f3816, %f3795, %f3815, %f3794; fma.rn.f32 %f3817, %f3816, %f3815, %f3797; fma.rn.f32 %f3818, %f3817, %f3815, %f3799; fma.rn.f32 %f3819, %f3818, %f3815, %f5198; fma.rn.f32 %f3820, %f3815, %f5569, %f3790; fma.rn.f32 %f3821, %f3803, %f3815, %f3802; fma.rn.f32 %f3822, %f3821, %f3815, %f3805; fma.rn.f32 %f3823, %f3822, %f3820, %f5569; and.b32 %r1347, %r1781, 1; setp.eq.b32 %p605, %r1347, 1; selp.f32 %f3824, %f3819, %f3823, %p605; selp.f32 %f3825, %f3823, %f3819, %p605; and.b32 %r1348, %r1781, 2; setp.eq.s32 %p606, %r1348, 0; neg.f32 %f3826, %f3824; selp.f32 %f3827, %f3824, %f3826, %p606; add.s32 %r1349, %r1781, 1; and.b32 %r1350, %r1349, 2; setp.eq.s32 %p607, %r1350, 0; neg.f32 %f3828, %f3825; selp.f32 %f3829, %f3825, %f3828, %p607; mov.b32 %r1351, %f3829; neg.f32 %f3830, %f3827; mov.b32 %r1352, %f3827; cvt.u64.u32 %rd907, %r1352; mov.b32 %r1353, %f3830; cvt.u64.u32 %rd908, %r1353; cvt.u64.u32 %rd909, %r1351; bfi.b64 %rd910, %rd909, %rd908, 32, 32; mov.b64 {%r1354, %r1355}, %rd910; bfi.b64 %rd911, %rd907, %rd909, 32, 32; mov.b64 {%r1356, %r1357}, %rd911; mul.f32 %f3831, %f3791, %f3812; mov.b32 %r1358, %f3831; cvt.u64.u32 %rd912, %r1358; mov.b32 %r1359, %f3814; cvt.u64.u32 %rd913, %r1359; neg.f32 %f3832, %f3812; mov.b32 %r1360, %f3832; mul.f32 %f3833, %f3791, %f3814; mov.b32 %r1361, %f3833; cvt.u64.u32 %rd914, %r1361; cvt.u64.u32 %rd915, %r1360; bfi.b64 %rd916, %rd914, %rd915, 32, 32; mov.b64 {%r1362, %r1363}, %rd916; bfi.b64 %rd917, %rd912, %rd913, 32, 32; mov.b64 {%r1364, %r1365}, %rd917; add.f32 %f608, %f582, 0fBF800000; fma.rn.f32 %f609, %f583, %f3791, 0fBF800000; mov.b32 %f610, %r1356; mov.b32 %f611, %r1357; mov.b32 %f612, %r1354; mov.b32 %f613, %r1355; mov.b32 %f614, %r1364; mov.b32 %f615, %r1365; mov.b32 %f616, %r1362; mov.b32 %f617, %r1363; add.f32 %f618, %f578, 0fBF800000; setp.eq.f32 %p608, %f496, 0f3F800000; @%p608 bra $L__BB0_454; bra.uni $L__BB0_450; $L__BB0_454: ld.global.f32 %f3899, [%rd67+20]; add.f32 %f3900, %f3899, %f3899; mul.f32 %f3901, 
%f577, %f3900; mul.f32 %f3902, %f608, %f610; mul.f32 %f3903, %f608, %f611; mul.f32 %f3904, %f609, %f612; mul.f32 %f3905, %f615, %f3904; fma.rn.f32 %f3906, %f614, %f3902, %f3905; mul.f32 %f3907, %f609, %f613; mul.f32 %f3908, %f615, %f3907; fma.rn.f32 %f3909, %f614, %f3903, %f3908; mul.f32 %f3910, %f617, %f3904; fma.rn.f32 %f3911, %f616, %f3902, %f3910; mul.f32 %f3912, %f617, %f3907; fma.rn.f32 %f3913, %f616, %f3903, %f3912; mul.f32 %f3914, %f3906, %f3901; mul.f32 %f3915, %f3909, %f3901; mul.f32 %f3916, %f3911, %f3901; mul.f32 %f3917, %f3913, %f3901; mul.f32 %f3918, %f5530, %f3916; fma.rn.f32 %f3919, %f5546, %f3914, %f3918; mul.f32 %f3920, %f5530, %f3917; fma.rn.f32 %f3921, %f5546, %f3915, %f3920; mul.f32 %f3922, %f3916, %f5635; fma.rn.f32 %f3923, %f3914, %f5531, %f3922; mul.f32 %f3924, %f3917, %f5635; fma.rn.f32 %f3925, %f3915, %f5531, %f3924; ld.global.f32 %f3926, [%rd67+16]; mul.f32 %f3927, %f577, %f3926; mul.f32 %f3928, %f618, %f3927; mul.f32 %f3929, %f578, %f3928; add.u64 %rd922, %SPL, 32; mov.u64 %rd923, 0; st.local.v2.u64 [%rd922], {%rd923, %rd923}; mov.u32 %r1371, 1065353216; st.local.u32 [%rd922], %r1371; st.local.u32 [%rd922+12], %r1371; ld.local.v4.f32 {%f3930, %f3931, %f3932, %f3933}, [%rd922]; fma.rn.f32 %f3938, %f3929, %f3931, %f3921; mov.b32 %r1372, %f3938; fma.rn.f32 %f3939, %f3929, %f3930, %f3919; mov.b32 %r1373, %f3939; fma.rn.f32 %f3940, %f3929, %f3933, %f3925; mov.b32 %r1374, %f3940; fma.rn.f32 %f3941, %f3929, %f3932, %f3923; mov.b32 %r1375, %f3941; st.local.v4.f32 [%rd228], {%f3939, %f3938, %f3941, %f3940}; mov.b64 %rd1376, {%r1375, %r1374}; mov.b64 %rd1375, {%r1373, %r1372}; bra.uni $L__BB0_455; $L__BB0_450: ld.global.f32 %f3834, [%rd67+20]; add.f32 %f3835, %f3834, %f3834; mul.f32 %f3836, %f577, %f3835; max.f32 %f3838, %f608, %f3790; mul.f32 %f3839, %f610, %f3838; mul.f32 %f3840, %f611, %f3838; max.f32 %f3841, %f609, %f3790; mul.f32 %f3842, %f612, %f3841; mul.f32 %f3843, %f613, %f3841; mul.f32 %f3844, %f615, %f3842; fma.rn.f32 %f3845, %f614, 
%f3839, %f3844; mul.f32 %f3846, %f615, %f3843; fma.rn.f32 %f3847, %f614, %f3840, %f3846; mul.f32 %f3848, %f617, %f3842; fma.rn.f32 %f3849, %f616, %f3839, %f3848; mul.f32 %f3850, %f617, %f3843; fma.rn.f32 %f3851, %f616, %f3840, %f3850; mul.f32 %f3852, %f3845, %f3836; mul.f32 %f3853, %f3847, %f3836; mul.f32 %f3854, %f3849, %f3836; mul.f32 %f3855, %f3851, %f3836; mul.f32 %f3856, %f5530, %f3854; fma.rn.f32 %f5570, %f5546, %f3852, %f3856; mul.f32 %f3857, %f5530, %f3855; fma.rn.f32 %f5571, %f5546, %f3853, %f3857; mul.f32 %f3858, %f3854, %f5635; fma.rn.f32 %f5572, %f3852, %f5531, %f3858; mul.f32 %f3859, %f3855, %f5635; fma.rn.f32 %f5573, %f3853, %f5531, %f3859; min.f32 %f3860, %f608, %f3790; mul.f32 %f3861, %f610, %f3860; mul.f32 %f3862, %f611, %f3860; min.f32 %f3863, %f609, %f3790; mul.f32 %f3864, %f612, %f3863; mul.f32 %f3865, %f613, %f3863; mul.f32 %f3866, %f615, %f3864; fma.rn.f32 %f3867, %f614, %f3861, %f3866; mul.f32 %f3868, %f615, %f3865; fma.rn.f32 %f3869, %f614, %f3862, %f3868; mul.f32 %f3870, %f617, %f3864; fma.rn.f32 %f3871, %f616, %f3861, %f3870; mul.f32 %f3872, %f617, %f3865; fma.rn.f32 %f3873, %f616, %f3862, %f3872; mul.f32 %f3874, %f3836, %f3867; mul.f32 %f3875, %f3836, %f3869; mul.f32 %f3876, %f3836, %f3871; mul.f32 %f3877, %f3836, %f3873; mul.f32 %f3878, %f5530, %f3876; fma.rn.f32 %f5574, %f5546, %f3874, %f3878; mul.f32 %f3879, %f5530, %f3877; fma.rn.f32 %f5575, %f5546, %f3875, %f3879; mul.f32 %f3880, %f3876, %f5635; fma.rn.f32 %f5576, %f3874, %f5531, %f3880; mul.f32 %f3881, %f3877, %f5635; fma.rn.f32 %f5577, %f3875, %f5531, %f3881; ld.global.f32 %f3882, [%rd67+16]; mul.f32 %f3883, %f577, %f3882; mul.f32 %f3884, %f618, %f3883; mul.f32 %f3885, %f578, %f3884; add.u64 %rd919, %SPL, 32; st.local.v4.f32 [%rd919], {%f3790, %f3790, %f3790, %f3790}; mov.u64 %rd920, 0; st.local.v2.u64 [%rd228], {%rd920, %rd920}; mov.u32 %r1366, 1065353216; st.local.u32 [%rd228], %r1366; st.local.u32 [%rd228+12], %r1366; ld.local.v4.f32 {%f3886, %f3887, %f3888, %f3889}, [%rd228]; 
mul.f32 %f627, %f3885, %f3886; mul.f32 %f628, %f3885, %f3887; mul.f32 %f629, %f3885, %f3888; mul.f32 %f630, %f3885, %f3889; setp.lt.f32 %p609, %f578, 0f3F800000; @%p609 bra $L__BB0_452; bra.uni $L__BB0_451; $L__BB0_452: add.f32 %f5574, %f5574, %f627; add.f32 %f5575, %f5575, %f628; add.f32 %f5576, %f5576, %f629; add.f32 %f5577, %f5577, %f630; bra.uni $L__BB0_453; $L__BB0_451: add.f32 %f5570, %f5570, %f627; add.f32 %f5571, %f5571, %f628; add.f32 %f5572, %f5572, %f629; add.f32 %f5573, %f5573, %f630; $L__BB0_453: ld.global.u8 %rs30, [%rd67+8]; setp.ne.s16 %p610, %rs30, 0; setp.eq.f32 %p611, %f496, 0f00000000; and.pred %p612, %p611, %p610; selp.f32 %f3894, 0f00000000, 0f3F800000, %p612; fma.rn.f32 %f3895, %f5571, %f3894, %f5575; mov.b32 %r1367, %f3895; fma.rn.f32 %f3896, %f5570, %f3894, %f5574; mov.b32 %r1368, %f3896; fma.rn.f32 %f3897, %f5573, %f3894, %f5577; mov.b32 %r1369, %f3897; fma.rn.f32 %f3898, %f5572, %f3894, %f5576; mov.b32 %r1370, %f3898; mov.b64 %rd1376, {%r1370, %r1369}; mov.b64 %rd1375, {%r1368, %r1367}; bra.uni $L__BB0_455; $L__BB0_258: abs.f32 %f5235, %f307; setp.neu.f32 %p365, %f5235, 0f7F800000; @%p365 bra $L__BB0_261; selp.f32 %f5509, 0fFF800000, 0f7F800000, %p9; $L__BB0_261: ld.global.u8 %rs23, [%rd67+48]; setp.eq.s16 %p366, %rs23, 0; @%p366 bra $L__BB0_265; mov.f32 %f5222, 0fBF000000; div.rn.f32 %f2638, %f255, %f307; setp.lt.f32 %p367, %f2638, 0f00800000; mul.f32 %f2639, %f2638, 0f4B000000; selp.f32 %f320, %f2639, %f2638, %p367; selp.f32 %f2640, 0fC1B80000, 0f00000000, %p367; mov.b32 %r902, %f320; add.s32 %r903, %r902, -1059760811; and.b32 %r904, %r903, -8388608; sub.s32 %r905, %r902, %r904; mov.b32 %f2641, %r905; cvt.rn.f32.s32 %f2642, %r904; mov.f32 %f2643, 0f34000000; fma.rn.f32 %f2644, %f2642, %f2643, %f2640; add.f32 %f2645, %f2641, 0fBF800000; mov.f32 %f2646, 0f3E1039F6; mov.f32 %f2647, 0fBE055027; fma.rn.f32 %f2648, %f2647, %f2645, %f2646; mov.f32 %f2649, 0fBDF8CDCC; fma.rn.f32 %f2650, %f2648, %f2645, %f2649; mov.f32 %f2651, 0f3E0F2955; 
fma.rn.f32 %f2652, %f2650, %f2645, %f2651; mov.f32 %f2653, 0fBE2AD8B9; fma.rn.f32 %f2654, %f2652, %f2645, %f2653; mov.f32 %f2655, 0f3E4CED0B; fma.rn.f32 %f2656, %f2654, %f2645, %f2655; mov.f32 %f2657, 0fBE7FFF22; fma.rn.f32 %f2658, %f2656, %f2645, %f2657; mov.f32 %f2659, 0f3EAAAA78; fma.rn.f32 %f2660, %f2658, %f2645, %f2659; fma.rn.f32 %f2662, %f2660, %f2645, %f5222; mul.f32 %f2663, %f2645, %f2662; fma.rn.f32 %f2664, %f2663, %f2645, %f2645; mov.f32 %f2665, 0f3F317218; fma.rn.f32 %f5510, %f2644, %f2665, %f2664; setp.lt.u32 %p368, %r902, 2139095040; @%p368 bra $L__BB0_264; mov.f32 %f2666, 0f7F800000; fma.rn.f32 %f5510, %f320, %f2666, %f2666; $L__BB0_264: setp.eq.f32 %p369, %f320, 0f00000000; selp.f32 %f2667, 0fFF800000, %f5510, %p369; add.f32 %f5517, %f5517, %f2667; $L__BB0_265: setp.eq.f32 %p370, %f307, 0f3F800000; selp.f32 %f2668, 0f3F800000, %f5509, %p370; mov.b64 {%r906, %r907}, %rd145; mov.b64 {%r908, %r909}, %rd144; mov.b32 %f2669, %r908; mul.f32 %f2670, %f2669, %f2668; mov.b32 %f2671, %r909; mul.f32 %f2672, %f2671, %f2668; mov.b32 %f2673, %r906; mul.f32 %f2674, %f2673, %f2668; mov.b32 %f2675, %r907; mul.f32 %f2676, %f2675, %f2668; mov.b64 {%r910, %r911}, %rd147; mov.b64 {%r912, %r913}, %rd146; mov.b32 %f2677, %r912; mov.b32 %f2678, %r913; mul.f32 %f2679, %f2678, %f2674; mul.f32 %f2680, %f2678, %f2676; mov.b32 %f2681, %r910; mov.b32 %f2682, %r911; mul.f32 %f2683, %f2682, %f2674; mul.f32 %f2684, %f2682, %f2676; fma.rn.f32 %f2685, %f2677, %f2672, %f2680; mov.b32 %r914, %f2685; fma.rn.f32 %f2686, %f2677, %f2670, %f2679; mov.b32 %r915, %f2686; fma.rn.f32 %f2687, %f2681, %f2672, %f2684; mov.b32 %r916, %f2687; fma.rn.f32 %f2688, %f2681, %f2670, %f2683; mov.b32 %r917, %f2688; mov.b64 %rd1344, {%r917, %r916}; mov.b64 %rd1343, {%r915, %r914}; bra.uni $L__BB0_283; $L__BB0_401: abs.f32 %f5312, %f498; setp.neu.f32 %p542, %f5312, 0f7F800000; @%p542 bra $L__BB0_405; setp.gt.s32 %p543, %r218, -1; selp.b32 %r1219, 2139095040, 0, %p543; or.b32 %r1220, %r1219, -2147483648; 
selp.b32 %r1221, %r1220, %r1219, %p12; mov.b32 %f5550, %r1221; $L__BB0_405: add.u64 %rd1276, %SPL, 0; mov.f32 %f5240, 0f00000000; setp.eq.s32 %p547, %r218, 0; setp.eq.f32 %p548, %f498, 0f3F800000; mov.u32 %r1225, 1065353216; or.pred %p549, %p548, %p547; add.f32 %f3534, %f5550, 0fBF800000; selp.f32 %f3535, 0f00000000, %f3534, %p549; mul.f32 %f3536, %f497, %f3535; ld.global.f32 %f3537, [%rd67+20]; neg.f32 %f3538, %f3537; max.f32 %f3539, %f3536, %f3538; mul.f32 %f3540, %f5532, %f3539; neg.f32 %f3541, %f3540; add.u64 %rd864, %SPL, 32; st.local.v4.f32 [%rd864], {%f5240, %f5240, %f5240, %f5240}; mov.u64 %rd865, 0; st.local.v2.u64 [%rd1276], {%rd865, %rd865}; st.local.u32 [%rd1276], %r1225; st.local.u32 [%rd1276+12], %r1225; ld.local.v4.f32 {%f3543, %f3544, %f3545, %f3546}, [%rd1276]; mul.f32 %f5551, %f3543, %f3541; mul.f32 %f5552, %f3544, %f3541; mul.f32 %f5553, %f3545, %f3541; mul.f32 %f5554, %f3546, %f3541; ld.global.f32 %f522, [%rd67+16]; setp.eq.f32 %p550, %f522, 0f00000000; @%p550 bra $L__BB0_407; add.f32 %f3551, %f5533, %f5533; add.f32 %f3552, %f5535, %f5534; add.f32 %f3553, %f5536, %f5536; mul.f32 %f3554, %f3552, 0f3F000000; mul.f32 %f3555, %f3553, 0f3F000000; mul.f32 %f3556, %f3551, 0f3F000000; add.f32 %f3557, %f3556, 0f00000000; add.f32 %f3558, %f3555, %f3557; mul.f32 %f3559, %f3558, 0f3F000000; st.local.v4.f32 [%rd864], {%f3556, %f3554, %f3554, %f3555}; sub.f32 %f3560, %f3556, %f3559; st.local.f32 [%rd864], %f3560; sub.f32 %f3561, %f3555, %f3559; st.local.f32 [%rd864+12], %f3561; ld.local.v4.f32 {%f3562, %f3563, %f3564, %f3565}, [%rd864]; add.f32 %f3566, %f522, %f522; mul.f32 %f3567, %f5532, %f3566; fma.rn.f32 %f5551, %f3567, %f3562, %f5551; fma.rn.f32 %f5552, %f3567, %f3563, %f5552; fma.rn.f32 %f5553, %f3567, %f3564, %f5553; fma.rn.f32 %f5554, %f3567, %f3565, %f5554; $L__BB0_407: mov.b32 %r1226, %f5551; mov.b32 %r1227, %f5552; mov.b64 %rd1375, {%r1226, %r1227}; mov.b32 %r1228, %f5553; mov.b32 %r1229, %f5554; mov.b64 %rd1376, {%r1228, %r1229}; $L__BB0_455: 
setp.eq.s32 %p614, %r217, 1; mov.pred %p838, 0; @%p614 bra $L__BB0_467; mov.b64 {%r1376, %r1377}, %rd1376; mov.b64 {%r1378, %r1379}, %rd1375; mov.b32 %f656, %r1378; abs.f32 %f3942, %f656; mov.b32 %f5580, %r1379; abs.f32 %f3943, %f5580; setp.le.f32 %p615, %f3943, %f3942; selp.f32 %f3944, %f3942, %f3943, %p615; mov.b32 %f658, %r1376; abs.f32 %f3945, %f658; setp.le.f32 %p616, %f3945, %f3944; selp.f32 %f3946, %f3944, %f3945, %p616; mov.b32 %f5581, %r1377; abs.f32 %f3947, %f5581; setp.le.f32 %p617, %f3947, %f3946; selp.f32 %f660, %f3946, %f3947, %p617; setp.eq.f32 %p618, %f660, 0f00000000; @%p618 bra $L__BB0_458; div.rn.f32 %f5580, %f5580, %f660; div.rn.f32 %f5581, %f5581, %f660; mov.b32 %r1380, %f5580; div.rn.f32 %f3948, %f656, %f660; mov.b32 %r1381, %f3948; mov.b64 %rd1375, {%r1381, %r1380}; $L__BB0_458: fma.rn.f32 %f3950, %f5580, %f5580, 0f00000000; sqrt.rn.f32 %f3951, %f3950; setp.ltu.f32 %p619, %f5580, 0f00000000; selp.f32 %f3952, 0fBF800000, 0f3F800000, %p619; neg.f32 %f3953, %f5580; selp.f32 %f3954, %f3953, %f5580, %p619; mul.f32 %f5582, %f3952, %f3951; fma.rn.f32 %f3955, %f3954, %f3951, %f3950; add.f32 %f666, %f3955, %f3955; setp.eq.f32 %p620, %f666, 0f00000000; @%p620 bra $L__BB0_460; add.f32 %f3956, %f5580, %f5582; sqrt.rn.f32 %f3957, %f666; div.rn.f32 %f3958, %f3956, %f3957; neg.f32 %f5582, %f5582; add.f32 %f3959, %f3958, %f3958; fma.rn.f32 %f3960, %f5581, %f3959, 0f00000000; mul.f32 %f3961, %f3958, %f3960; add.f32 %f3962, %f3961, 0f00000000; sub.f32 %f3963, %f5581, %f3961; sub.f32 %f3964, %f3963, %f3961; add.f32 %f3965, %f3962, %f3962; mul.f32 %f3966, %f3958, %f3965; fma.rn.f32 %f5581, %f3958, %f3966, %f3964; $L__BB0_460: abs.f32 %f671, %f5582; mov.b64 {%r261, %r1384}, %rd1375; mov.b32 %f5584, %r261; abs.f32 %f3967, %f671; abs.f32 %f3968, %f5584; abs.f32 %f3969, %f5581; add.f32 %f3970, %f3969, %f3968; mul.f32 %f3971, %f3970, 0f358637BD; setp.leu.f32 %p621, %f3967, %f3971; @%p621 bra $L__BB0_466; mov.b32 %r1385, %f671; cvt.u64.u32 %rd928, %r261; cvt.u64.u32 
%rd929, %r1385; mov.b32 %r1386, %f5581; cvt.u64.u32 %rd930, %r1386; mov.u64 %rd927, 0; bfi.b64 %rd931, %rd930, %rd929, 32, 32; mov.b64 {%r1387, %r1388}, %rd931; bfi.b64 %rd932, %rd929, %rd928, 32, 32; mov.b64 {%r1389, %r1390}, %rd932; mov.b32 %f673, %r1389; mov.b32 %f3972, %r1390; mov.b32 %f3973, %r1387; mov.b32 %f674, %r1388; sub.f32 %f3974, %f673, %f674; mul.f32 %f3975, %f3974, 0f3F000000; mul.f32 %f3976, %f3975, %f3975; fma.rn.f32 %f675, %f3972, %f3973, %f3976; setp.ltu.f32 %p622, %f675, 0f00000000; mov.u64 %rd1378, %rd927; mov.u64 %rd1379, %rd927; mov.u64 %rd1380, %rd927; @%p622 bra $L__BB0_463; sqrt.rn.f32 %f3977, %f675; add.f32 %f3978, %f674, %f673; mul.f32 %f3979, %f3978, 0f3F000000; add.f32 %f3980, %f3979, %f3977; sub.f32 %f3981, %f3979, %f3977; mov.b32 %r1391, %f3980; mov.b32 %r1392, %f3981; cvt.u64.u32 %rd935, %r1392; cvt.u64.u32 %rd936, %r1391; bfi.b64 %rd937, %rd935, %rd936, 32, 32; shr.u64 %rd1379, %rd937, 32; shl.b64 %rd1378, %rd937, 32; mov.u64 %rd1380, 1; $L__BB0_463: or.b64 %rd259, %rd1380, %rd1378; or.b64 %rd260, %rd927, %rd1379; cvt.u32.u64 %r1393, %rd1378; cvt.u32.u64 %r1394, %rd1380; or.b32 %r1395, %r1394, %r1393; setp.eq.s32 %p623, %r1395, 0; @%p623 bra $L__BB0_465; mov.b64 {%r1396, %r1397}, %rd260; mov.b64 {%r1398, %r1399}, %rd259; mov.b32 %f5584, %r1399; mov.b32 %f5581, %r1396; $L__BB0_466: mul.f32 %f3982, %f660, %f5581; mul.f32 %f3983, %f660, %f5584; setp.le.f32 %p624, %f3983, %f3982; selp.f32 %f3984, %f3983, %f3982, %p624; setp.ge.f32 %p625, %f3983, %f3982; selp.f32 %f3985, %f3983, %f3982, %p625; ld.global.f32 %f3986, [%rd67+84]; setp.gt.f32 %p626, %f3985, %f3986; sub.f32 %f3987, %f3985, %f3984; mul.f32 %f3988, %f3987, 0f3F000000; ld.global.f32 %f3989, [%rd67+88]; setp.gt.f32 %p627, %f3988, %f3989; or.pred %p838, %p626, %p627; $L__BB0_467: selp.b32 %r10, 0, %r10, %p838; $L__BB0_468: mov.b32 %f681, %r10; and.b16 %rs31, %rs5, 3; setp.eq.s16 %p628, %rs31, 1; @%p628 bra $L__BB0_488; setp.eq.s16 %p629, %rs31, 2; @%p629 bra $L__BB0_472; 
setp.ne.s16 %p630, %rs31, 3; @%p630 bra $L__BB0_503; mov.f32 %f5613, 0f00000000; mov.f32 %f5614, %f5613; mov.f32 %f5615, %f5613; mov.f32 %f5616, %f5613; bra.uni $L__BB0_535; $L__BB0_472: mov.f32 %f5297, 0f3102E308; mov.f32 %f5296, 0fBF317218; mov.f32 %f5295, 0f3FB8AA3B; mov.f32 %f5294, 0f35BFBE8E; mov.f32 %f5293, 0f3F317200; mov.f32 %f5292, 0f3DAAAABD; mov.f32 %f5291, 0f3C4CAF63; mov.f32 %f5290, 0f3B18F0FE; ld.global.f32 %f682, [%rd67+8]; div.rn.f32 %f3997, %f440, %f5546; div.rn.f32 %f683, %f3997, %f440; ld.global.u32 %r264, [%rd67+12]; cvt.rn.f32.s32 %f684, %r264; mul.f32 %f3998, %f684, 0f3F000000; cvt.rzi.f32.f32 %f3999, %f3998; add.f32 %f4000, %f3999, %f3999; sub.f32 %f4001, %f684, %f4000; abs.f32 %f685, %f4001; abs.f32 %f686, %f683; setp.lt.f32 %p631, %f686, 0f00800000; mul.f32 %f4002, %f686, 0f4B800000; selp.f32 %f4003, %f4002, %f686, %p631; selp.f32 %f4004, 0fC3170000, 0fC2FE0000, %p631; mov.b32 %r1400, %f4003; and.b32 %r1401, %r1400, 8388607; or.b32 %r1402, %r1401, 1065353216; mov.b32 %f4005, %r1402; shr.u32 %r1403, %r1400, 23; cvt.rn.f32.u32 %f4006, %r1403; add.f32 %f4007, %f4004, %f4006; setp.gt.f32 %p632, %f4005, 0f3FB504F3; mul.f32 %f4008, %f4005, 0f3F000000; add.f32 %f4009, %f4007, 0f3F800000; selp.f32 %f4010, %f4009, %f4007, %p632; selp.f32 %f4011, %f4008, %f4005, %p632; add.f32 %f4012, %f4011, 0fBF800000; add.f32 %f3995, %f4011, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f3994,%f3995; // end inline asm add.f32 %f4013, %f4012, %f4012; mul.f32 %f4014, %f3994, %f4013; mul.f32 %f4015, %f4014, %f4014; fma.rn.f32 %f4018, %f5290, %f4015, %f5291; fma.rn.f32 %f4020, %f4018, %f4015, %f5292; mul.rn.f32 %f4021, %f4020, %f4015; mul.rn.f32 %f4022, %f4021, %f4014; sub.f32 %f4023, %f4012, %f4014; add.f32 %f4024, %f4023, %f4023; neg.f32 %f4025, %f4014; fma.rn.f32 %f4026, %f4025, %f4012, %f4024; mul.rn.f32 %f4027, %f3994, %f4026; add.f32 %f4028, %f4022, %f4014; sub.f32 %f4029, %f4014, %f4028; add.f32 %f4030, %f4022, %f4029; add.f32 %f4031, %f4027, %f4030; 
add.f32 %f4032, %f4028, %f4031; sub.f32 %f4033, %f4028, %f4032; add.f32 %f4034, %f4031, %f4033; mul.rn.f32 %f4036, %f4010, %f5293; mul.rn.f32 %f4038, %f4010, %f5294; add.f32 %f4039, %f4036, %f4032; sub.f32 %f4040, %f4036, %f4039; add.f32 %f4041, %f4032, %f4040; add.f32 %f4042, %f4034, %f4041; add.f32 %f4043, %f4038, %f4042; add.f32 %f4044, %f4039, %f4043; sub.f32 %f4045, %f4039, %f4044; add.f32 %f4046, %f4043, %f4045; abs.f32 %f687, %f684; setp.gt.f32 %p633, %f687, 0f77F684DF; mul.f32 %f4047, %f684, 0f39000000; selp.f32 %f4048, %f4047, %f684, %p633; mul.rn.f32 %f4049, %f4048, %f4044; neg.f32 %f4050, %f4049; fma.rn.f32 %f4051, %f4048, %f4044, %f4050; fma.rn.f32 %f4052, %f4048, %f4046, %f4051; mov.f32 %f4053, 0f00000000; fma.rn.f32 %f4054, %f4053, %f4044, %f4052; add.rn.f32 %f4055, %f4049, %f4054; neg.f32 %f4056, %f4055; add.rn.f32 %f4057, %f4049, %f4056; add.rn.f32 %f4058, %f4057, %f4054; mov.b32 %r1404, %f4055; setp.eq.s32 %p634, %r1404, 1118925336; add.s32 %r1405, %r1404, -1; mov.b32 %f4059, %r1405; add.f32 %f4060, %f4058, 0f37000000; selp.f32 %f688, %f4060, %f4058, %p634; selp.f32 %f4061, %f4059, %f4055, %p634; mul.rn.f32 %f4063, %f4061, %f5295; cvt.rzi.f32.f32 %f4064, %f4063; abs.f32 %f4065, %f4064; setp.gt.f32 %p635, %f4065, 0f42FC0000; mov.b32 %r1406, %f4064; and.b32 %r1407, %r1406, -2147483648; or.b32 %r1408, %r1407, 1123811328; mov.b32 %f4066, %r1408; selp.f32 %f4067, %f4066, %f4064, %p635; fma.rn.f32 %f4069, %f4067, %f5296, %f4061; fma.rn.f32 %f4071, %f4067, %f5297, %f4069; mul.f32 %f4072, %f4071, 0f3FB8AA3B; add.f32 %f4073, %f4067, 0f4B40007F; mov.b32 %r1409, %f4073; shl.b32 %r1410, %r1409, 23; mov.b32 %f4074, %r1410; ex2.approx.ftz.f32 %f4075, %f4072; mul.f32 %f689, %f4075, %f4074; setp.eq.f32 %p636, %f689, 0f7F800000; mov.f32 %f5586, 0f7F800000; @%p636 bra $L__BB0_474; fma.rn.f32 %f5586, %f689, %f688, %f689; $L__BB0_474: setp.lt.f32 %p637, %f683, 0f00000000; setp.eq.f32 %p638, %f685, 0f3F800000; and.pred %p16, %p637, %p638; setp.eq.f32 %p639, %f683, 
0f00000000; @%p639 bra $L__BB0_478; bra.uni $L__BB0_475; $L__BB0_478: add.f32 %f4079, %f683, %f683; mov.b32 %r1413, %f4079; selp.b32 %r1414, %r1413, 0, %p638; or.b32 %r1415, %r1414, 2139095040; setp.lt.s32 %p643, %r264, 0; selp.b32 %r1416, %r1415, %r1414, %p643; mov.b32 %f5588, %r1416; bra.uni $L__BB0_479; $L__BB0_488: mul.f32 %f5306, %f5530, %f5531; mov.f32 %f5305, 0f3102E308; mov.f32 %f5304, 0fBF317218; mov.f32 %f5303, 0f3FB8AA3B; mov.f32 %f5302, 0f35BFBE8E; mov.f32 %f5301, 0f3F317200; mov.f32 %f5300, 0f3DAAAABD; mov.f32 %f5299, 0f3C4CAF63; mov.f32 %f5298, 0f3B18F0FE; ld.global.u64 %rd943, [%rd67+24]; mul.wide.u32 %rd944, %r8, 16; add.s64 %rd945, %rd943, %rd944; ld.f32 %f4123, [%rd945+8]; mul.f32 %f4124, %f681, 0f3F7FBE77; fma.rn.f32 %f712, %f4124, %f681, 0f3A83126F; mul.f32 %f4125, %f5546, %f5635; sub.f32 %f713, %f4125, %f5306; ld.global.f32 %f4126, [%rd67+16]; mul.f32 %f4127, %f4126, 0f3F2AAAAB; ld.global.f32 %f4128, [%rd67+12]; mul.f32 %f4129, %f4123, %f4128; fma.rn.f32 %f714, %f4123, %f4127, %f4129; mul.f32 %f4130, %f5530, %f5530; fma.rn.f32 %f715, %f5546, %f5546, %f4130; mul.f32 %f4131, %f5530, %f5635; fma.rn.f32 %f716, %f5546, %f5531, %f4131; mul.f32 %f4132, %f5635, %f5635; fma.rn.f32 %f717, %f5531, %f5531, %f4132; mul.f32 %f718, %f4123, %f4126; mov.f32 %f4133, 0fBF000000; cvt.rzi.f32.f32 %f4134, %f4133; add.f32 %f4135, %f4134, %f4134; mov.f32 %f4136, 0fBF800000; sub.f32 %f4137, %f4136, %f4135; abs.f32 %f719, %f4137; abs.f32 %f720, %f713; setp.lt.f32 %p658, %f720, 0f00800000; mul.f32 %f4138, %f720, 0f4B800000; selp.f32 %f4139, %f4138, %f720, %p658; selp.f32 %f4140, 0fC3170000, 0fC2FE0000, %p658; mov.b32 %r1425, %f4139; and.b32 %r1426, %r1425, 8388607; or.b32 %r1427, %r1426, 1065353216; mov.b32 %f4141, %r1427; shr.u32 %r1428, %r1425, 23; cvt.rn.f32.u32 %f4142, %r1428; add.f32 %f4143, %f4140, %f4142; setp.gt.f32 %p659, %f4141, 0f3FB504F3; mul.f32 %f4144, %f4141, 0f3F000000; add.f32 %f4145, %f4143, 0f3F800000; selp.f32 %f4146, %f4145, %f4143, %p659; selp.f32 
%f4147, %f4144, %f4141, %p659; add.f32 %f4148, %f4147, 0fBF800000; add.f32 %f4121, %f4147, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4120,%f4121; // end inline asm add.f32 %f4149, %f4148, %f4148; mul.f32 %f4150, %f4120, %f4149; mul.f32 %f4151, %f4150, %f4150; fma.rn.f32 %f4154, %f5298, %f4151, %f5299; fma.rn.f32 %f4156, %f4154, %f4151, %f5300; mul.rn.f32 %f4157, %f4156, %f4151; mul.rn.f32 %f4158, %f4157, %f4150; sub.f32 %f4159, %f4148, %f4150; add.f32 %f4160, %f4159, %f4159; neg.f32 %f4161, %f4150; fma.rn.f32 %f4162, %f4161, %f4148, %f4160; mul.rn.f32 %f4163, %f4120, %f4162; add.f32 %f4164, %f4158, %f4150; sub.f32 %f4165, %f4150, %f4164; add.f32 %f4166, %f4158, %f4165; add.f32 %f4167, %f4163, %f4166; add.f32 %f4168, %f4164, %f4167; sub.f32 %f4169, %f4164, %f4168; add.f32 %f4170, %f4167, %f4169; mul.rn.f32 %f4172, %f4146, %f5301; mul.rn.f32 %f4174, %f4146, %f5302; add.f32 %f4175, %f4172, %f4168; sub.f32 %f4176, %f4172, %f4175; add.f32 %f4177, %f4168, %f4176; add.f32 %f4178, %f4170, %f4177; add.f32 %f4179, %f4174, %f4178; add.f32 %f4180, %f4175, %f4179; sub.f32 %f4181, %f4175, %f4180; add.f32 %f4182, %f4179, %f4181; mul.rn.f32 %f4183, %f4136, %f4180; neg.f32 %f4184, %f4183; fma.rn.f32 %f4185, %f4136, %f4180, %f4184; fma.rn.f32 %f4186, %f4136, %f4182, %f4185; mov.f32 %f4187, 0f00000000; fma.rn.f32 %f4188, %f4187, %f4180, %f4186; add.rn.f32 %f4189, %f4183, %f4188; neg.f32 %f4190, %f4189; add.rn.f32 %f4191, %f4183, %f4190; add.rn.f32 %f4192, %f4191, %f4188; mov.b32 %r1429, %f4189; setp.eq.s32 %p660, %r1429, 1118925336; add.s32 %r1430, %r1429, -1; mov.b32 %f4193, %r1430; add.f32 %f4194, %f4192, 0f37000000; selp.f32 %f721, %f4194, %f4192, %p660; selp.f32 %f4195, %f4193, %f4189, %p660; mul.rn.f32 %f4197, %f4195, %f5303; cvt.rzi.f32.f32 %f4198, %f4197; abs.f32 %f4199, %f4198; setp.gt.f32 %p661, %f4199, 0f42FC0000; mov.b32 %r1431, %f4198; and.b32 %r1432, %r1431, -2147483648; or.b32 %r1433, %r1432, 1123811328; mov.b32 %f4200, %r1433; selp.f32 %f4201, %f4200, 
%f4198, %p661; fma.rn.f32 %f4203, %f4201, %f5304, %f4195; fma.rn.f32 %f4205, %f4201, %f5305, %f4203; mul.f32 %f4206, %f4205, 0f3FB8AA3B; add.f32 %f4207, %f4201, 0f4B40007F; mov.b32 %r1434, %f4207; shl.b32 %r1435, %r1434, 23; mov.b32 %f4208, %r1435; ex2.approx.ftz.f32 %f4209, %f4206; mul.f32 %f722, %f4209, %f4208; setp.eq.f32 %p662, %f722, 0f7F800000; mov.f32 %f5589, 0f7F800000; @%p662 bra $L__BB0_490; fma.rn.f32 %f5589, %f722, %f721, %f722; $L__BB0_490: setp.lt.f32 %p663, %f713, 0f00000000; setp.eq.f32 %p664, %f719, 0f3F800000; and.pred %p17, %p663, %p664; setp.eq.f32 %p665, %f713, 0f00000000; @%p665 bra $L__BB0_494; bra.uni $L__BB0_491; $L__BB0_494: add.f32 %f4214, %f713, %f713; mov.b32 %r1438, %f4214; or.b32 %r1439, %r1438, 2139095040; mov.b32 %f4215, %r1439; selp.f32 %f5591, %f4215, 0f7F800000, %p664; bra.uni $L__BB0_495; $L__BB0_503: mul.f32 %f5241, %f5530, %f5531; add.u64 %rd1278, %SPL, 80; ld.global.u64 %rd951, [%rd67+24]; mul.wide.u32 %rd952, %r8, 16; add.s64 %rd953, %rd951, %rd952; ld.f32 %f762, [%rd953+8]; mul.f32 %f4240, %f5546, %f5635; sub.f32 %f763, %f4240, %f5241; ld.local.v4.f32 {%f5546, %f4242, %f4243, %f4244}, [%rd1278]; add.f32 %f4246, %f4244, %f5546; mul.f32 %f765, %f4246, 0f3F000000; sub.f32 %f4247, %f5546, %f4244; mul.f32 %f4248, %f4247, 0f3F000000; add.f32 %f4251, %f4242, %f4243; mul.f32 %f4252, %f4251, 0f3F000000; sub.f32 %f4253, %f4242, %f4243; mul.f32 %f766, %f4253, 0f3F000000; mul.f32 %f4254, %f766, %f766; fma.rn.f32 %f4255, %f765, %f765, %f4254; sqrt.rn.f32 %f4256, %f4255; mul.f32 %f4257, %f4252, %f4252; fma.rn.f32 %f4258, %f4248, %f4248, %f4257; sqrt.rn.f32 %f4259, %f4258; add.f32 %f767, %f4256, %f4259; sub.f32 %f768, %f4256, %f4259; abs.f32 %f769, %f4248; abs.f32 %f770, %f4252; setp.eq.f32 %p674, %f769, 0f00000000; setp.eq.f32 %p675, %f770, 0f00000000; and.pred %p676, %p674, %p675; mov.b32 %r265, %f4248; mov.b32 %r1442, %f4252; and.b32 %r266, %r1442, -2147483648; @%p676 bra $L__BB0_507; bra.uni $L__BB0_504; $L__BB0_507: shr.s32 %r1447, 
%r265, 31; and.b32 %r1448, %r1447, 1078530011; or.b32 %r1449, %r1448, %r266; mov.b32 %f5600, %r1449; bra.uni $L__BB0_508; $L__BB0_504: setp.eq.f32 %p677, %f769, 0f7F800000; setp.eq.f32 %p678, %f770, 0f7F800000; and.pred %p679, %p677, %p678; @%p679 bra $L__BB0_506; bra.uni $L__BB0_505; $L__BB0_506: setp.lt.s32 %p683, %r265, 0; selp.b32 %r1445, 1075235812, 1061752795, %p683; or.b32 %r1446, %r1445, %r266; mov.b32 %f5600, %r1446; bra.uni $L__BB0_508; $L__BB0_475: mov.b32 %r1411, %f5586; xor.b32 %r1412, %r1411, -2147483648; mov.b32 %f4076, %r1412; selp.f32 %f5588, %f4076, %f5586, %p16; setp.geu.f32 %p640, %f683, 0f00000000; @%p640 bra $L__BB0_479; cvt.rzi.f32.f32 %f4077, %f684; setp.eq.f32 %p641, %f4077, %f684; @%p641 bra $L__BB0_479; mov.f32 %f5588, 0f7FFFFFFF; $L__BB0_479: add.f32 %f4080, %f686, %f687; mov.b32 %r1417, %f4080; setp.lt.s32 %p644, %r1417, 2139095040; @%p644 bra $L__BB0_486; setp.gtu.f32 %p645, %f686, 0f7F800000; setp.gtu.f32 %p646, %f687, 0f7F800000; or.pred %p647, %p645, %p646; @%p647 bra $L__BB0_485; bra.uni $L__BB0_481; $L__BB0_485: add.f32 %f5588, %f683, %f684; bra.uni $L__BB0_486; $L__BB0_491: mov.b32 %r1436, %f5589; xor.b32 %r1437, %r1436, -2147483648; mov.b32 %f4210, %r1437; selp.f32 %f5591, %f4210, %f5589, %p17; setp.geu.f32 %p666, %f713, 0f00000000; @%p666 bra $L__BB0_495; cvt.rzi.f32.f32 %f4212, %f4136; setp.eq.f32 %p667, %f4212, 0fBF800000; @%p667 bra $L__BB0_495; mov.f32 %f5591, 0f7FFFFFFF; $L__BB0_495: add.f32 %f4216, %f720, 0f3F800000; mov.b32 %r1440, %f4216; setp.lt.s32 %p669, %r1440, 2139095040; @%p669 bra $L__BB0_500; setp.gtu.f32 %p670, %f720, 0f7F800000; @%p670 bra $L__BB0_499; bra.uni $L__BB0_497; $L__BB0_499: add.f32 %f5591, %f713, 0fBF800000; bra.uni $L__BB0_500; $L__BB0_481: setp.eq.f32 %p648, %f687, 0f7F800000; @%p648 bra $L__BB0_484; bra.uni $L__BB0_482; $L__BB0_484: setp.gt.f32 %p651, %f686, 0f3F800000; selp.b32 %r1421, 2139095040, 0, %p651; xor.b32 %r1422, %r1421, 2139095040; setp.lt.s32 %p652, %r264, 0; selp.b32 %r1423, 
%r1422, %r1421, %p652; mov.b32 %f4081, %r1423; setp.eq.f32 %p653, %f683, 0fBF800000; selp.f32 %f5588, 0f3F800000, %f4081, %p653; bra.uni $L__BB0_486; $L__BB0_497: setp.neu.f32 %p671, %f720, 0f7F800000; @%p671 bra $L__BB0_500; selp.f32 %f5591, 0f80000000, 0f00000000, %p17; $L__BB0_500: add.u64 %rd1286, %SPL, 0; setp.eq.f32 %p672, %f713, 0f3F800000; mov.u32 %r1441, 1065353216; selp.f32 %f4217, 0f3F800000, %f5591, %p672; mul.f32 %f4218, %f718, %f4217; add.f32 %f4219, %f715, 0f00000000; add.f32 %f4220, %f4219, %f717; mul.f32 %f4221, %f4220, 0f3F000000; sub.f32 %f4222, %f715, %f4221; sub.f32 %f4223, %f717, %f4221; mul.f32 %f5592, %f4222, %f4218; mul.f32 %f5593, %f716, %f4218; mul.f32 %f5595, %f4223, %f4218; fma.rn.f32 %f4224, %f713, %f713, 0fBF800000; mul.f32 %f4225, %f714, 0f3F000000; mul.f32 %f4226, %f4224, %f4225; add.u64 %rd947, %SPL, 32; st.local.v4.f32 [%rd947], {%f4187, %f4187, %f4187, %f4187}; mov.u64 %rd948, 0; st.local.v2.u64 [%rd1286], {%rd948, %rd948}; st.local.u32 [%rd1286], %r1441; st.local.u32 [%rd1286+12], %r1441; ld.local.v4.f32 {%f4228, %f4229, %f4230, %f4231}, [%rd1286]; mul.f32 %f5596, %f4226, %f4228; mul.f32 %f5597, %f4226, %f4229; mul.f32 %f5598, %f4226, %f4230; mul.f32 %f5599, %f4226, %f4231; setp.ltu.f32 %p673, %f713, 0f3F800000; mov.f32 %f5594, %f5593; @%p673 bra $L__BB0_502; add.f32 %f5592, %f5592, %f5596; add.f32 %f747, %f5593, %f5597; add.f32 %f5594, %f5593, %f5598; add.f32 %f5595, %f5595, %f5599; st.local.v4.f32 [%rd947], {%f4187, %f4187, %f4187, %f4187}; mov.f32 %f5593, %f747; mov.f32 %f5596, %f4187; mov.f32 %f5597, %f4187; mov.f32 %f5598, %f4187; mov.f32 %f5599, %f4187; $L__BB0_502: fma.rn.f32 %f5613, %f712, %f5592, %f5596; fma.rn.f32 %f5614, %f712, %f5593, %f5597; fma.rn.f32 %f5615, %f712, %f5594, %f5598; fma.rn.f32 %f5616, %f712, %f5595, %f5599; bra.uni $L__BB0_535; $L__BB0_505: setp.lt.s32 %p680, %r265, 0; min.f32 %f4260, %f770, %f769; max.f32 %f4261, %f770, %f769; div.rn.f32 %f4262, %f4260, %f4261; mul.rn.f32 %f4263, %f4262, %f4262; 
mov.f32 %f4264, 0fC0B59883; mov.f32 %f4265, 0fBF52C7EA; fma.rn.f32 %f4266, %f4263, %f4265, %f4264; mov.f32 %f4267, 0fC0D21907; fma.rn.f32 %f4268, %f4266, %f4263, %f4267; mul.f32 %f4269, %f4263, %f4268; mul.f32 %f4270, %f4262, %f4269; add.f32 %f4271, %f4263, 0f41355DC0; mov.f32 %f4272, 0f41E6BD60; fma.rn.f32 %f4273, %f4271, %f4263, %f4272; mov.f32 %f4274, 0f419D92C8; fma.rn.f32 %f4275, %f4273, %f4263, %f4274; rcp.rn.f32 %f4276, %f4275; fma.rn.f32 %f4277, %f4270, %f4276, %f4262; mov.f32 %f4278, 0f3FC90FDB; sub.f32 %f4279, %f4278, %f4277; setp.gt.f32 %p681, %f770, %f769; selp.f32 %f4280, %f4279, %f4277, %p681; mov.f32 %f4281, 0f40490FDB; sub.f32 %f4282, %f4281, %f4280; selp.f32 %f4283, %f4282, %f4280, %p680; mov.b32 %r1443, %f4283; or.b32 %r1444, %r266, %r1443; mov.b32 %f4284, %r1444; add.f32 %f4285, %f769, %f770; setp.le.f32 %p682, %f4285, 0f7F800000; selp.f32 %f5600, %f4284, %f4285, %p682; $L__BB0_508: abs.f32 %f775, %f765; setp.eq.f32 %p684, %f775, 0f00000000; abs.f32 %f776, %f766; setp.eq.f32 %p685, %f776, 0f00000000; and.pred %p686, %p684, %p685; mov.b32 %r267, %f765; mov.b32 %r1450, %f766; and.b32 %r268, %r1450, -2147483648; @%p686 bra $L__BB0_512; bra.uni $L__BB0_509; $L__BB0_512: shr.s32 %r1455, %r267, 31; and.b32 %r1456, %r1455, 1078530011; or.b32 %r1457, %r1456, %r268; mov.b32 %f5601, %r1457; bra.uni $L__BB0_513; $L__BB0_509: setp.eq.f32 %p687, %f775, 0f7F800000; setp.eq.f32 %p688, %f776, 0f7F800000; and.pred %p689, %p687, %p688; @%p689 bra $L__BB0_511; bra.uni $L__BB0_510; $L__BB0_511: setp.lt.s32 %p693, %r267, 0; selp.b32 %r1453, 1075235812, 1061752795, %p693; or.b32 %r1454, %r1453, %r268; mov.b32 %f5601, %r1454; bra.uni $L__BB0_513; $L__BB0_510: setp.lt.s32 %p690, %r267, 0; min.f32 %f4286, %f776, %f775; max.f32 %f4287, %f776, %f775; div.rn.f32 %f4288, %f4286, %f4287; mul.rn.f32 %f4289, %f4288, %f4288; mov.f32 %f4290, 0fC0B59883; mov.f32 %f4291, 0fBF52C7EA; fma.rn.f32 %f4292, %f4289, %f4291, %f4290; mov.f32 %f4293, 0fC0D21907; fma.rn.f32 %f4294, %f4292, 
%f4289, %f4293; mul.f32 %f4295, %f4289, %f4294; mul.f32 %f4296, %f4288, %f4295; add.f32 %f4297, %f4289, 0f41355DC0; mov.f32 %f4298, 0f41E6BD60; fma.rn.f32 %f4299, %f4297, %f4289, %f4298; mov.f32 %f4300, 0f419D92C8; fma.rn.f32 %f4301, %f4299, %f4289, %f4300; rcp.rn.f32 %f4302, %f4301; fma.rn.f32 %f4303, %f4296, %f4302, %f4288; mov.f32 %f4304, 0f3FC90FDB; sub.f32 %f4305, %f4304, %f4303; setp.gt.f32 %p691, %f776, %f775; selp.f32 %f4306, %f4305, %f4303, %p691; mov.f32 %f4307, 0f40490FDB; sub.f32 %f4308, %f4307, %f4306; selp.f32 %f4309, %f4308, %f4306, %p690; mov.b32 %r1451, %f4309; or.b32 %r1452, %r268, %r1451; mov.b32 %f4310, %r1452; add.f32 %f4311, %f775, %f776; setp.le.f32 %p692, %f4311, 0f7F800000; selp.f32 %f5601, %f4310, %f4311, %p692; $L__BB0_513: sub.f32 %f4312, %f5601, %f5600; mul.f32 %f781, %f4312, 0f3F000000; add.f32 %f4313, %f5600, %f5601; mul.f32 %f782, %f4313, 0f3F000000; mul.f32 %f4314, %f781, 0f3F22F983; cvt.rni.s32.f32 %r1787, %f4314; cvt.rn.f32.s32 %f4315, %r1787; mov.f32 %f4316, 0fBFC90FDA; fma.rn.f32 %f4317, %f4315, %f4316, %f781; mov.f32 %f4318, 0fB3A22168; fma.rn.f32 %f4319, %f4315, %f4318, %f4317; mov.f32 %f4320, 0fA7C234C5; fma.rn.f32 %f5602, %f4315, %f4320, %f4319; abs.f32 %f784, %f781; setp.leu.f32 %p694, %f784, 0f47CE4780; @%p694 bra $L__BB0_521; setp.eq.f32 %p695, %f784, 0f7F800000; @%p695 bra $L__BB0_520; bra.uni $L__BB0_515; $L__BB0_520: mov.f32 %f4323, 0f00000000; mul.rn.f32 %f5602, %f781, %f4323; bra.uni $L__BB0_521; $L__BB0_515: mov.b32 %r270, %f781; bfe.u32 %r1460, %r270, 23, 8; add.s32 %r271, %r1460, -128; shl.b32 %r1461, %r270, 8; or.b32 %r272, %r1461, -2147483648; shr.u32 %r273, %r271, 5; add.u64 %rd957, %SP, 32; add.u64 %rd1382, %SPL, 32; mov.u32 %r1783, 0; mov.u64 %rd1381, __cudart_i2opi_f; mov.u32 %r1784, %r1783; $L__BB0_516: .pragma "nounroll"; mov.u32 %r275, %r1784; ld.global.nc.u32 %r1464, [%rd1381]; // begin inline asm { mad.lo.cc.u32 %r1462, %r1464, %r272, %r275; madc.hi.u32 %r1784, %r1464, %r272, 0; } // end inline asm 
st.local.u32 [%rd1382], %r1462; add.s64 %rd1382, %rd1382, 4; add.s64 %rd1381, %rd1381, 4; add.s32 %r1783, %r1783, 1; setp.ne.s32 %p696, %r1783, 6; @%p696 bra $L__BB0_516; mov.u32 %r1469, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1467, %r1469, %r272, %r275; madc.hi.u32 %r1468, %r1469, %r272, 0; } // end inline asm cvta.to.local.u64 %rd959, %rd957; st.local.u32 [%rd959+24], %r1468; mov.u32 %r1472, 4; sub.s32 %r278, %r1472, %r273; mov.u32 %r1473, 6; sub.s32 %r1474, %r1473, %r273; mul.wide.s32 %rd960, %r1474, 4; add.s64 %rd961, %rd959, %rd960; ld.local.u32 %r1785, [%rd961]; ld.local.u32 %r1786, [%rd961+-4]; and.b32 %r281, %r271, 31; setp.eq.s32 %p697, %r281, 0; @%p697 bra $L__BB0_519; mov.u32 %r1475, 32; sub.s32 %r1476, %r1475, %r281; shr.u32 %r1477, %r1786, %r1476; shl.b32 %r1478, %r1785, %r281; add.s32 %r1785, %r1477, %r1478; mul.wide.s32 %rd964, %r278, 4; add.s64 %rd965, %rd959, %rd964; ld.local.u32 %r1479, [%rd965]; shr.u32 %r1480, %r1479, %r1476; shl.b32 %r1481, %r1786, %r281; add.s32 %r1786, %r1480, %r1481; $L__BB0_519: and.b32 %r1482, %r270, -2147483648; shr.u32 %r1483, %r1786, 30; shl.b32 %r1484, %r1785, 2; or.b32 %r1485, %r1483, %r1484; shr.u32 %r1486, %r1485, 31; shr.u32 %r1487, %r1785, 30; add.s32 %r1488, %r1486, %r1487; neg.s32 %r1489, %r1488; setp.eq.s32 %p698, %r1482, 0; selp.b32 %r1787, %r1488, %r1489, %p698; setp.ne.s32 %p699, %r1486, 0; xor.b32 %r1490, %r1482, -2147483648; selp.b32 %r1491, %r1490, %r1482, %p699; selp.b32 %r1492, -1, 0, %p699; xor.b32 %r1493, %r1485, %r1492; shl.b32 %r1494, %r1786, 2; xor.b32 %r1495, %r1494, %r1492; cvt.u64.u32 %rd966, %r1493; cvt.u64.u32 %rd967, %r1495; bfi.b64 %rd968, %rd966, %rd967, 32, 32; cvt.rn.f64.s64 %fd23, %rd968; mul.f64 %fd24, %fd23, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f4321, %fd24; setp.eq.s32 %p700, %r1491, 0; neg.f32 %f4322, %f4321; selp.f32 %f5602, %f4321, %f4322, %p700; $L__BB0_521: mul.f32 %f4324, %f782, 0f3F22F983; cvt.rni.s32.f32 %r1792, %f4324; cvt.rn.f32.s32 %f4325, %r1792; fma.rn.f32 
%f4327, %f4325, %f4316, %f782; fma.rn.f32 %f4329, %f4325, %f4318, %f4327; fma.rn.f32 %f5603, %f4325, %f4320, %f4329; abs.f32 %f789, %f782; setp.leu.f32 %p701, %f789, 0f47CE4780; @%p701 bra $L__BB0_529; setp.eq.f32 %p702, %f789, 0f7F800000; @%p702 bra $L__BB0_528; bra.uni $L__BB0_523; $L__BB0_528: mov.f32 %f4333, 0f00000000; mul.rn.f32 %f5603, %f782, %f4333; bra.uni $L__BB0_529; $L__BB0_523: mov.b32 %r289, %f782; bfe.u32 %r1498, %r289, 23, 8; add.s32 %r290, %r1498, -128; shl.b32 %r1499, %r289, 8; or.b32 %r291, %r1499, -2147483648; shr.u32 %r292, %r290, 5; add.u64 %rd970, %SP, 32; add.u64 %rd1384, %SPL, 32; mov.u32 %r1788, 0; mov.u64 %rd1383, __cudart_i2opi_f; mov.u32 %r1789, %r1788; $L__BB0_524: .pragma "nounroll"; mov.u32 %r294, %r1789; ld.global.nc.u32 %r1502, [%rd1383]; // begin inline asm { mad.lo.cc.u32 %r1500, %r1502, %r291, %r294; madc.hi.u32 %r1789, %r1502, %r291, 0; } // end inline asm st.local.u32 [%rd1384], %r1500; add.s64 %rd1384, %rd1384, 4; add.s64 %rd1383, %rd1383, 4; add.s32 %r1788, %r1788, 1; setp.ne.s32 %p703, %r1788, 6; @%p703 bra $L__BB0_524; mov.u32 %r1507, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1505, %r1507, %r291, %r294; madc.hi.u32 %r1506, %r1507, %r291, 0; } // end inline asm cvta.to.local.u64 %rd972, %rd970; st.local.u32 [%rd972+24], %r1506; mov.u32 %r1510, 4; sub.s32 %r297, %r1510, %r292; mov.u32 %r1511, 6; sub.s32 %r1512, %r1511, %r292; mul.wide.s32 %rd973, %r1512, 4; add.s64 %rd974, %rd972, %rd973; ld.local.u32 %r1790, [%rd974]; ld.local.u32 %r1791, [%rd974+-4]; and.b32 %r300, %r290, 31; setp.eq.s32 %p704, %r300, 0; @%p704 bra $L__BB0_527; mov.u32 %r1513, 32; sub.s32 %r1514, %r1513, %r300; shr.u32 %r1515, %r1791, %r1514; shl.b32 %r1516, %r1790, %r300; add.s32 %r1790, %r1515, %r1516; mul.wide.s32 %rd977, %r297, 4; add.s64 %rd978, %rd972, %rd977; ld.local.u32 %r1517, [%rd978]; shr.u32 %r1518, %r1517, %r1514; shl.b32 %r1519, %r1791, %r300; add.s32 %r1791, %r1518, %r1519; $L__BB0_527: and.b32 %r1520, %r289, -2147483648; shr.u32 
%r1521, %r1791, 30; shl.b32 %r1522, %r1790, 2; or.b32 %r1523, %r1521, %r1522; shr.u32 %r1524, %r1523, 31; shr.u32 %r1525, %r1790, 30; add.s32 %r1526, %r1524, %r1525; neg.s32 %r1527, %r1526; setp.eq.s32 %p705, %r1520, 0; selp.b32 %r1792, %r1526, %r1527, %p705; setp.ne.s32 %p706, %r1524, 0; xor.b32 %r1528, %r1520, -2147483648; selp.b32 %r1529, %r1528, %r1520, %p706; selp.b32 %r1530, -1, 0, %p706; xor.b32 %r1531, %r1523, %r1530; shl.b32 %r1532, %r1791, 2; xor.b32 %r1533, %r1532, %r1530; cvt.u64.u32 %rd979, %r1531; cvt.u64.u32 %rd980, %r1533; bfi.b64 %rd981, %rd979, %rd980, 32, 32; cvt.rn.f64.s64 %fd25, %rd981; mul.f64 %fd26, %fd25, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f4331, %fd26; setp.eq.s32 %p707, %r1529, 0; neg.f32 %f4332, %f4331; selp.f32 %f5603, %f4331, %f4332, %p707; $L__BB0_529: mov.f32 %f5199, 0f3F800000; setp.lt.f32 %p708, %f768, 0f00000000; mov.f32 %f4334, 0f00000000; selp.f32 %f4335, 0fBF800000, 0f3F800000, %p708; mul.f32 %f4337, %f5602, %f5602; mov.f32 %f4338, 0fBAB607ED; mov.f32 %f4339, 0f37CBAC00; fma.rn.f32 %f4340, %f4339, %f4337, %f4338; mov.f32 %f4341, 0f3D2AAABB; fma.rn.f32 %f4342, %f4340, %f4337, %f4341; mov.f32 %f4343, 0fBEFFFFFF; fma.rn.f32 %f4344, %f4342, %f4337, %f4343; fma.rn.f32 %f4345, %f4344, %f4337, %f5199; mov.f32 %f4346, 0f3C0885E4; mov.f32 %f4347, 0fB94D4153; fma.rn.f32 %f4348, %f4347, %f4337, %f4346; mov.f32 %f4349, 0fBE2AAAA8; fma.rn.f32 %f4350, %f4348, %f4337, %f4349; fma.rn.f32 %f4351, %f4337, %f5602, %f4334; fma.rn.f32 %f4352, %f4350, %f4351, %f5602; and.b32 %r1534, %r1787, 1; setp.eq.b32 %p709, %r1534, 1; selp.f32 %f4353, %f4345, %f4352, %p709; selp.f32 %f4354, %f4352, %f4345, %p709; neg.f32 %f4355, %f4353; and.b32 %r1535, %r1787, 2; setp.eq.s32 %p710, %r1535, 0; selp.f32 %f4356, %f4353, %f4355, %p710; neg.f32 %f4357, %f4354; add.s32 %r1536, %r1787, 1; and.b32 %r1537, %r1536, 2; setp.eq.s32 %p711, %r1537, 0; selp.f32 %f4358, %f4354, %f4357, %p711; mul.f32 %f4359, %f5603, %f5603; fma.rn.f32 %f4360, %f4339, %f4359, %f4338; fma.rn.f32 
%f4361, %f4360, %f4359, %f4341; fma.rn.f32 %f4362, %f4361, %f4359, %f4343; fma.rn.f32 %f4363, %f4362, %f4359, %f5199; fma.rn.f32 %f4364, %f4359, %f5603, %f4334; fma.rn.f32 %f4365, %f4347, %f4359, %f4346; fma.rn.f32 %f4366, %f4365, %f4359, %f4349; fma.rn.f32 %f4367, %f4366, %f4364, %f5603; and.b32 %r1538, %r1792, 1; setp.eq.b32 %p712, %r1538, 1; selp.f32 %f4368, %f4363, %f4367, %p712; selp.f32 %f4369, %f4367, %f4363, %p712; and.b32 %r1539, %r1792, 2; setp.eq.s32 %p713, %r1539, 0; neg.f32 %f4370, %f4368; selp.f32 %f4371, %f4368, %f4370, %p713; add.s32 %r1540, %r1792, 1; and.b32 %r1541, %r1540, 2; setp.eq.s32 %p714, %r1541, 0; neg.f32 %f4372, %f4369; selp.f32 %f4373, %f4369, %f4372, %p714; mov.b32 %r1542, %f4373; neg.f32 %f4374, %f4371; mov.b32 %r1543, %f4371; cvt.u64.u32 %rd982, %r1543; mov.b32 %r1544, %f4374; cvt.u64.u32 %rd983, %r1544; cvt.u64.u32 %rd984, %r1542; bfi.b64 %rd985, %rd984, %rd983, 32, 32; mov.b64 {%r1545, %r1546}, %rd985; bfi.b64 %rd986, %rd982, %rd984, 32, 32; mov.b64 {%r1547, %r1548}, %rd986; mul.f32 %f4375, %f4335, %f4356; mov.b32 %r1549, %f4375; cvt.u64.u32 %rd987, %r1549; mov.b32 %r1550, %f4358; cvt.u64.u32 %rd988, %r1550; neg.f32 %f4376, %f4356; mov.b32 %r1551, %f4376; mul.f32 %f4377, %f4335, %f4358; mov.b32 %r1552, %f4377; cvt.u64.u32 %rd989, %r1552; cvt.u64.u32 %rd990, %r1551; bfi.b64 %rd991, %rd989, %rd990, 32, 32; mov.b64 {%r1553, %r1554}, %rd991; bfi.b64 %rd992, %rd987, %rd988, 32, 32; mov.b64 {%r1555, %r1556}, %rd992; add.f32 %f793, %f767, 0fBF800000; fma.rn.f32 %f794, %f768, %f4335, 0fBF800000; mov.b32 %f795, %r1547; mov.b32 %f796, %r1548; mov.b32 %f797, %r1545; mov.b32 %f798, %r1546; mov.b32 %f799, %r1555; mov.b32 %f800, %r1556; mov.b32 %f801, %r1553; mov.b32 %f802, %r1554; add.f32 %f803, %f763, 0fBF800000; setp.eq.f32 %p715, %f681, 0f3F800000; @%p715 bra $L__BB0_534; bra.uni $L__BB0_530; $L__BB0_534: add.u64 %rd1282, %SPL, 0; ld.global.f32 %f4439, [%rd67+20]; add.f32 %f4440, %f4439, %f4439; mul.f32 %f4441, %f762, %f4440; mul.f32 %f4442, 
%f793, %f795; mul.f32 %f4443, %f793, %f796; mul.f32 %f4444, %f794, %f797; mul.f32 %f4445, %f800, %f4444; fma.rn.f32 %f4446, %f799, %f4442, %f4445; mul.f32 %f4447, %f794, %f798; mul.f32 %f4448, %f800, %f4447; fma.rn.f32 %f4449, %f799, %f4443, %f4448; mul.f32 %f4450, %f802, %f4444; fma.rn.f32 %f4451, %f801, %f4442, %f4450; mul.f32 %f4452, %f802, %f4447; fma.rn.f32 %f4453, %f801, %f4443, %f4452; mul.f32 %f4454, %f4446, %f4441; mul.f32 %f4455, %f4449, %f4441; mul.f32 %f4456, %f4451, %f4441; mul.f32 %f4457, %f4453, %f4441; mul.f32 %f4458, %f5530, %f4456; fma.rn.f32 %f4459, %f5546, %f4454, %f4458; mul.f32 %f4460, %f5530, %f4457; fma.rn.f32 %f4461, %f5546, %f4455, %f4460; mul.f32 %f4462, %f4456, %f5635; fma.rn.f32 %f4463, %f4454, %f5531, %f4462; mul.f32 %f4464, %f4457, %f5635; fma.rn.f32 %f4465, %f4455, %f5531, %f4464; ld.global.f32 %f4466, [%rd67+16]; mul.f32 %f4467, %f762, %f4466; mul.f32 %f4468, %f803, %f4467; mul.f32 %f4469, %f763, %f4468; add.u64 %rd997, %SPL, 32; mov.u64 %rd998, 0; st.local.v2.u64 [%rd997], {%rd998, %rd998}; mov.u32 %r1558, 1065353216; st.local.u32 [%rd997], %r1558; st.local.u32 [%rd997+12], %r1558; ld.local.v4.f32 {%f4470, %f4471, %f4472, %f4473}, [%rd997]; fma.rn.f32 %f5616, %f4469, %f4473, %f4465; fma.rn.f32 %f5615, %f4469, %f4472, %f4463; fma.rn.f32 %f5614, %f4469, %f4471, %f4461; fma.rn.f32 %f5613, %f4469, %f4470, %f4459; st.local.v4.f32 [%rd1282], {%f5613, %f5614, %f5615, %f5616}; bra.uni $L__BB0_535; $L__BB0_530: add.u64 %rd1280, %SPL, 0; ld.global.f32 %f4378, [%rd67+20]; add.f32 %f4379, %f4378, %f4378; mul.f32 %f4380, %f762, %f4379; max.f32 %f4382, %f793, %f4334; mul.f32 %f4383, %f795, %f4382; mul.f32 %f4384, %f796, %f4382; max.f32 %f4385, %f794, %f4334; mul.f32 %f4386, %f797, %f4385; mul.f32 %f4387, %f798, %f4385; mul.f32 %f4388, %f800, %f4386; fma.rn.f32 %f4389, %f799, %f4383, %f4388; mul.f32 %f4390, %f800, %f4387; fma.rn.f32 %f4391, %f799, %f4384, %f4390; mul.f32 %f4392, %f802, %f4386; fma.rn.f32 %f4393, %f801, %f4383, %f4392; mul.f32 
%f4394, %f802, %f4387; fma.rn.f32 %f4395, %f801, %f4384, %f4394; mul.f32 %f4396, %f4389, %f4380; mul.f32 %f4397, %f4391, %f4380; mul.f32 %f4398, %f4393, %f4380; mul.f32 %f4399, %f4395, %f4380; mul.f32 %f4400, %f5530, %f4398; fma.rn.f32 %f5604, %f5546, %f4396, %f4400; mul.f32 %f4401, %f5530, %f4399; fma.rn.f32 %f5605, %f5546, %f4397, %f4401; mul.f32 %f4402, %f4398, %f5635; fma.rn.f32 %f5606, %f4396, %f5531, %f4402; mul.f32 %f4403, %f4399, %f5635; fma.rn.f32 %f5607, %f4397, %f5531, %f4403; min.f32 %f4404, %f793, %f4334; mul.f32 %f4405, %f795, %f4404; mul.f32 %f4406, %f796, %f4404; min.f32 %f4407, %f794, %f4334; mul.f32 %f4408, %f797, %f4407; mul.f32 %f4409, %f798, %f4407; mul.f32 %f4410, %f800, %f4408; fma.rn.f32 %f4411, %f799, %f4405, %f4410; mul.f32 %f4412, %f800, %f4409; fma.rn.f32 %f4413, %f799, %f4406, %f4412; mul.f32 %f4414, %f802, %f4408; fma.rn.f32 %f4415, %f801, %f4405, %f4414; mul.f32 %f4416, %f802, %f4409; fma.rn.f32 %f4417, %f801, %f4406, %f4416; mul.f32 %f4418, %f4380, %f4411; mul.f32 %f4419, %f4380, %f4413; mul.f32 %f4420, %f4380, %f4415; mul.f32 %f4421, %f4380, %f4417; mul.f32 %f4422, %f5530, %f4420; fma.rn.f32 %f5608, %f5546, %f4418, %f4422; mul.f32 %f4423, %f5530, %f4421; fma.rn.f32 %f5609, %f5546, %f4419, %f4423; mul.f32 %f4424, %f4420, %f5635; fma.rn.f32 %f5610, %f4418, %f5531, %f4424; mul.f32 %f4425, %f4421, %f5635; fma.rn.f32 %f5611, %f4419, %f5531, %f4425; ld.global.f32 %f4426, [%rd67+16]; mul.f32 %f4427, %f762, %f4426; mul.f32 %f4428, %f803, %f4427; mul.f32 %f4429, %f763, %f4428; add.u64 %rd994, %SPL, 32; st.local.v4.f32 [%rd994], {%f4334, %f4334, %f4334, %f4334}; mov.u64 %rd995, 0; st.local.v2.u64 [%rd1280], {%rd995, %rd995}; mov.u32 %r1557, 1065353216; st.local.u32 [%rd1280], %r1557; st.local.u32 [%rd1280+12], %r1557; ld.local.v4.f32 {%f4430, %f4431, %f4432, %f4433}, [%rd1280]; mul.f32 %f812, %f4429, %f4430; mul.f32 %f813, %f4429, %f4431; mul.f32 %f814, %f4429, %f4432; mul.f32 %f815, %f4429, %f4433; setp.lt.f32 %p716, %f763, 0f3F800000; 
@%p716 bra $L__BB0_532; bra.uni $L__BB0_531; $L__BB0_532: add.f32 %f5608, %f5608, %f812; add.f32 %f5609, %f5609, %f813; add.f32 %f5610, %f5610, %f814; add.f32 %f5611, %f5611, %f815; bra.uni $L__BB0_533; $L__BB0_531: add.f32 %f5604, %f5604, %f812; add.f32 %f5605, %f5605, %f813; add.f32 %f5606, %f5606, %f814; add.f32 %f5607, %f5607, %f815; $L__BB0_533: ld.global.u8 %rs32, [%rd67+8]; setp.ne.s16 %p717, %rs32, 0; setp.eq.f32 %p718, %f681, 0f00000000; and.pred %p719, %p718, %p717; selp.f32 %f4438, 0f00000000, 0f3F800000, %p719; fma.rn.f32 %f5613, %f5604, %f4438, %f5608; fma.rn.f32 %f5614, %f5605, %f4438, %f5609; fma.rn.f32 %f5615, %f5606, %f4438, %f5610; fma.rn.f32 %f5616, %f5607, %f4438, %f5611; bra.uni $L__BB0_535; $L__BB0_482: setp.neu.f32 %p649, %f686, 0f7F800000; @%p649 bra $L__BB0_486; setp.gt.s32 %p650, %r264, -1; selp.b32 %r1418, 2139095040, 0, %p650; or.b32 %r1419, %r1418, -2147483648; selp.b32 %r1420, %r1419, %r1418, %p16; mov.b32 %f5588, %r1420; $L__BB0_486: add.u64 %rd1284, %SPL, 0; setp.eq.s32 %p654, %r264, 0; setp.eq.f32 %p655, %f683, 0f3F800000; mov.u32 %r1424, 1065353216; or.pred %p656, %p655, %p654; add.f32 %f4082, %f5588, 0fBF800000; selp.f32 %f4083, 0f00000000, %f4082, %p656; mul.f32 %f4084, %f682, %f4083; ld.global.f32 %f4085, [%rd67+20]; neg.f32 %f4086, %f4085; max.f32 %f4087, %f4084, %f4086; mul.f32 %f4088, %f5532, %f4087; neg.f32 %f4089, %f4088; add.u64 %rd939, %SPL, 32; st.local.v4.f32 [%rd939], {%f4053, %f4053, %f4053, %f4053}; mov.u64 %rd940, 0; st.local.v2.u64 [%rd1284], {%rd940, %rd940}; st.local.u32 [%rd1284], %r1424; st.local.u32 [%rd1284+12], %r1424; ld.local.v4.f32 {%f4091, %f4092, %f4093, %f4094}, [%rd1284]; mul.f32 %f5613, %f4091, %f4089; mul.f32 %f5614, %f4092, %f4089; mul.f32 %f5615, %f4093, %f4089; mul.f32 %f5616, %f4094, %f4089; ld.global.f32 %f707, [%rd67+16]; setp.eq.f32 %p657, %f707, 0f00000000; @%p657 bra $L__BB0_535; add.f32 %f4099, %f5533, %f5533; add.f32 %f4100, %f5535, %f5534; add.f32 %f4101, %f5536, %f5536; mul.f32 %f4102, 
%f4100, 0f3F000000; mul.f32 %f4103, %f4101, 0f3F000000; mul.f32 %f4104, %f4099, 0f3F000000; add.f32 %f4105, %f4104, 0f00000000; add.f32 %f4106, %f4103, %f4105; mul.f32 %f4107, %f4106, 0f3F000000; st.local.v4.f32 [%rd939], {%f4104, %f4102, %f4102, %f4103}; sub.f32 %f4108, %f4104, %f4107; st.local.f32 [%rd939], %f4108; sub.f32 %f4109, %f4103, %f4107; st.local.f32 [%rd939+12], %f4109; ld.local.v4.f32 {%f4110, %f4111, %f4112, %f4113}, [%rd939]; add.f32 %f4114, %f707, %f707; mul.f32 %f4115, %f5532, %f4114; fma.rn.f32 %f5613, %f4115, %f4110, %f5613; fma.rn.f32 %f5614, %f4115, %f4111, %f5614; fma.rn.f32 %f5615, %f4115, %f4112, %f5615; fma.rn.f32 %f5616, %f4115, %f4113, %f5616; $L__BB0_535: mov.f32 %f5249, 0f3102E308; mov.f32 %f5248, 0fBF317218; mov.f32 %f5247, 0f3FB8AA3B; mov.f32 %f5246, 0f35BFBE8E; mov.f32 %f5245, 0f3F317200; mov.f32 %f5244, 0f3DAAAABD; mov.f32 %f5243, 0f3C4CAF63; mov.f32 %f5242, 0f3B18F0FE; ld.param.f32 %f5213, [g2p2g_param_11]; mov.f32 %f5174, 0f3FC00000; div.rn.f32 %f4481, %f115, %f5213; mov.b32 %r1559, %f4481; and.b32 %r1560, %r1559, -2147483648; or.b32 %r1561, %r1560, 1056964608; mov.b32 %f4482, %r1561; add.rz.f32 %f4483, %f4481, %f4482; cvt.rzi.f32.f32 %f853, %f4483; div.rn.f32 %f4484, %f116, %f5213; mov.b32 %r1562, %f4484; and.b32 %r1563, %r1562, -2147483648; or.b32 %r1564, %r1563, 1056964608; mov.b32 %f4485, %r1564; add.rz.f32 %f4486, %f4484, %f4485; cvt.rzi.f32.f32 %f854, %f4486; add.f32 %f4487, %f853, 0fBF800000; add.f32 %f4488, %f854, 0fBF800000; mul.f32 %f4489, %f5213, %f4487; mul.f32 %f4490, %f5213, %f4488; sub.f32 %f855, %f4489, %f115; sub.f32 %f856, %f4490, %f116; neg.f32 %f4491, %f855; div.rn.f32 %f857, %f4491, %f5213; sub.f32 %f858, %f5174, %f857; abs.f32 %f859, %f858; setp.lt.f32 %p720, %f859, 0f00800000; mul.f32 %f4493, %f859, 0f4B800000; selp.f32 %f4494, %f4493, %f859, %p720; selp.f32 %f4495, 0fC3170000, 0fC2FE0000, %p720; mov.b32 %r1565, %f4494; and.b32 %r1566, %r1565, 8388607; or.b32 %r1567, %r1566, 1065353216; mov.b32 %f4496, 
%r1567; shr.u32 %r1568, %r1565, 23; cvt.rn.f32.u32 %f4497, %r1568; add.f32 %f4498, %f4495, %f4497; setp.gt.f32 %p721, %f4496, 0f3FB504F3; mul.f32 %f4499, %f4496, 0f3F000000; add.f32 %f4500, %f4498, 0f3F800000; selp.f32 %f4501, %f4500, %f4498, %p721; selp.f32 %f4502, %f4499, %f4496, %p721; add.f32 %f4503, %f4502, 0fBF800000; add.f32 %f4479, %f4502, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4478,%f4479; // end inline asm add.f32 %f4504, %f4503, %f4503; mul.f32 %f4506, %f4478, %f4504; mul.f32 %f4507, %f4506, %f4506; fma.rn.f32 %f4510, %f5242, %f4507, %f5243; fma.rn.f32 %f4512, %f4510, %f4507, %f5244; mul.rn.f32 %f4513, %f4512, %f4507; mul.rn.f32 %f4514, %f4513, %f4506; sub.f32 %f4515, %f4503, %f4506; add.f32 %f4516, %f4515, %f4515; neg.f32 %f4517, %f4506; fma.rn.f32 %f4518, %f4517, %f4503, %f4516; mul.rn.f32 %f4519, %f4478, %f4518; add.f32 %f4520, %f4514, %f4506; sub.f32 %f4521, %f4506, %f4520; add.f32 %f4522, %f4514, %f4521; add.f32 %f4523, %f4519, %f4522; add.f32 %f4524, %f4520, %f4523; sub.f32 %f4525, %f4520, %f4524; add.f32 %f4526, %f4523, %f4525; mul.rn.f32 %f4528, %f4501, %f5245; mul.rn.f32 %f4530, %f4501, %f5246; add.f32 %f4531, %f4528, %f4524; sub.f32 %f4532, %f4528, %f4531; add.f32 %f4533, %f4524, %f4532; add.f32 %f4534, %f4526, %f4533; add.f32 %f4535, %f4530, %f4534; add.f32 %f4536, %f4531, %f4535; sub.f32 %f4537, %f4531, %f4536; add.f32 %f4538, %f4535, %f4537; mul.rn.f32 %f4539, %f1007, %f4536; neg.f32 %f4540, %f4539; fma.rn.f32 %f4541, %f1007, %f4536, %f4540; fma.rn.f32 %f4542, %f1007, %f4538, %f4541; mov.f32 %f4543, 0f00000000; fma.rn.f32 %f4544, %f4543, %f4536, %f4542; add.rn.f32 %f4545, %f4539, %f4544; neg.f32 %f4546, %f4545; add.rn.f32 %f4547, %f4539, %f4546; add.rn.f32 %f4548, %f4547, %f4544; mov.b32 %r1569, %f4545; setp.eq.s32 %p722, %r1569, 1118925336; add.s32 %r1570, %r1569, -1; mov.b32 %f4549, %r1570; add.f32 %f4550, %f4548, 0f37000000; selp.f32 %f860, %f4550, %f4548, %p722; selp.f32 %f4551, %f4549, %f4545, %p722; mul.rn.f32 %f4553, 
%f4551, %f5247; cvt.rzi.f32.f32 %f4554, %f4553; abs.f32 %f4555, %f4554; setp.gt.f32 %p723, %f4555, 0f42FC0000; mov.b32 %r1571, %f4554; and.b32 %r1572, %r1571, -2147483648; or.b32 %r1573, %r1572, 1123811328; mov.b32 %f4556, %r1573; selp.f32 %f4557, %f4556, %f4554, %p723; fma.rn.f32 %f4559, %f4557, %f5248, %f4551; fma.rn.f32 %f4561, %f4557, %f5249, %f4559; mul.f32 %f4562, %f4561, 0f3FB8AA3B; add.f32 %f4563, %f4557, 0f4B40007F; mov.b32 %r1574, %f4563; shl.b32 %r1575, %r1574, 23; mov.b32 %f4564, %r1575; ex2.approx.ftz.f32 %f4565, %f4562; mul.f32 %f861, %f4565, %f4564; setp.eq.f32 %p724, %f861, 0f7F800000; mov.f32 %f5617, 0f7F800000; @%p724 bra $L__BB0_537; fma.rn.f32 %f5617, %f861, %f860, %f861; $L__BB0_537: setp.lt.f32 %p725, %f858, 0f00000000; and.pred %p18, %p725, %p55; setp.eq.f32 %p727, %f858, 0f00000000; @%p727 bra $L__BB0_541; bra.uni $L__BB0_538; $L__BB0_541: add.f32 %f4570, %f858, %f858; selp.f32 %f5619, %f4570, 0f00000000, %p55; bra.uni $L__BB0_542; $L__BB0_538: mov.b32 %r1576, %f5617; xor.b32 %r1577, %r1576, -2147483648; mov.b32 %f4566, %r1577; selp.f32 %f5619, %f4566, %f5617, %p18; setp.geu.f32 %p728, %f858, 0f00000000; @%p728 bra $L__BB0_542; cvt.rzi.f32.f32 %f4568, %f1007; setp.eq.f32 %p729, %f4568, 0f40000000; @%p729 bra $L__BB0_542; mov.f32 %f5619, 0f7FFFFFFF; $L__BB0_542: add.f32 %f4571, %f859, 0f40000000; mov.b32 %r1578, %f4571; setp.lt.s32 %p731, %r1578, 2139095040; @%p731 bra $L__BB0_547; setp.gtu.f32 %p732, %f859, 0f7F800000; @%p732 bra $L__BB0_546; bra.uni $L__BB0_544; $L__BB0_546: add.f32 %f5619, %f858, 0f40000000; bra.uni $L__BB0_547; $L__BB0_544: setp.neu.f32 %p733, %f859, 0f7F800000; @%p733 bra $L__BB0_547; selp.f32 %f5619, 0fFF800000, 0f7F800000, %p18; $L__BB0_547: mov.f32 %f5257, 0f3102E308; mov.f32 %f5256, 0fBF317218; mov.f32 %f5255, 0f3FB8AA3B; mov.f32 %f5254, 0f35BFBE8E; mov.f32 %f5253, 0f3F317200; mov.f32 %f5252, 0f3DAAAABD; mov.f32 %f5251, 0f3C4CAF63; mov.f32 %f5250, 0f3B18F0FE; mul.f32 %f4575, %f5619, 0f3F000000; setp.eq.f32 %p734, 
%f858, 0f3F800000; selp.f32 %f870, 0f3F000000, %f4575, %p734; add.f32 %f871, %f857, 0fBF800000; abs.f32 %f872, %f871; setp.lt.f32 %p735, %f872, 0f00800000; mul.f32 %f4576, %f872, 0f4B800000; selp.f32 %f4577, %f4576, %f872, %p735; selp.f32 %f4578, 0fC3170000, 0fC2FE0000, %p735; mov.b32 %r1579, %f4577; and.b32 %r1580, %r1579, 8388607; or.b32 %r1581, %r1580, 1065353216; mov.b32 %f4579, %r1581; shr.u32 %r1582, %r1579, 23; cvt.rn.f32.u32 %f4580, %r1582; add.f32 %f4581, %f4578, %f4580; setp.gt.f32 %p736, %f4579, 0f3FB504F3; mul.f32 %f4582, %f4579, 0f3F000000; add.f32 %f4583, %f4581, 0f3F800000; selp.f32 %f4584, %f4583, %f4581, %p736; selp.f32 %f4585, %f4582, %f4579, %p736; add.f32 %f4586, %f4585, 0fBF800000; add.f32 %f4573, %f4585, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4572,%f4573; // end inline asm add.f32 %f4587, %f4586, %f4586; mul.f32 %f4589, %f4572, %f4587; mul.f32 %f4590, %f4589, %f4589; fma.rn.f32 %f4593, %f5250, %f4590, %f5251; fma.rn.f32 %f4595, %f4593, %f4590, %f5252; mul.rn.f32 %f4596, %f4595, %f4590; mul.rn.f32 %f4597, %f4596, %f4589; sub.f32 %f4598, %f4586, %f4589; add.f32 %f4599, %f4598, %f4598; neg.f32 %f4600, %f4589; fma.rn.f32 %f4601, %f4600, %f4586, %f4599; mul.rn.f32 %f4602, %f4572, %f4601; add.f32 %f4603, %f4597, %f4589; sub.f32 %f4604, %f4589, %f4603; add.f32 %f4605, %f4597, %f4604; add.f32 %f4606, %f4602, %f4605; add.f32 %f4607, %f4603, %f4606; sub.f32 %f4608, %f4603, %f4607; add.f32 %f4609, %f4606, %f4608; mul.rn.f32 %f4611, %f4584, %f5253; mul.rn.f32 %f4613, %f4584, %f5254; add.f32 %f4614, %f4611, %f4607; sub.f32 %f4615, %f4611, %f4614; add.f32 %f4616, %f4607, %f4615; add.f32 %f4617, %f4609, %f4616; add.f32 %f4618, %f4613, %f4617; add.f32 %f4619, %f4614, %f4618; sub.f32 %f4620, %f4614, %f4619; add.f32 %f4621, %f4618, %f4620; mul.rn.f32 %f4622, %f1007, %f4619; neg.f32 %f4623, %f4622; fma.rn.f32 %f4624, %f1007, %f4619, %f4623; fma.rn.f32 %f4625, %f1007, %f4621, %f4624; fma.rn.f32 %f4627, %f4543, %f4619, %f4625; add.rn.f32 %f4628, 
%f4622, %f4627; neg.f32 %f4629, %f4628; add.rn.f32 %f4630, %f4622, %f4629; add.rn.f32 %f4631, %f4630, %f4627; mov.b32 %r1583, %f4628; setp.eq.s32 %p737, %r1583, 1118925336; add.s32 %r1584, %r1583, -1; mov.b32 %f4632, %r1584; add.f32 %f4633, %f4631, 0f37000000; selp.f32 %f873, %f4633, %f4631, %p737; selp.f32 %f4634, %f4632, %f4628, %p737; mul.rn.f32 %f4636, %f4634, %f5255; cvt.rzi.f32.f32 %f4637, %f4636; abs.f32 %f4638, %f4637; setp.gt.f32 %p738, %f4638, 0f42FC0000; mov.b32 %r1585, %f4637; and.b32 %r1586, %r1585, -2147483648; or.b32 %r1587, %r1586, 1123811328; mov.b32 %f4639, %r1587; selp.f32 %f4640, %f4639, %f4637, %p738; fma.rn.f32 %f4642, %f4640, %f5256, %f4634; fma.rn.f32 %f4644, %f4640, %f5257, %f4642; mul.f32 %f4645, %f4644, 0f3FB8AA3B; add.f32 %f4646, %f4640, 0f4B40007F; mov.b32 %r1588, %f4646; shl.b32 %r1589, %r1588, 23; mov.b32 %f4647, %r1589; ex2.approx.ftz.f32 %f4648, %f4645; mul.f32 %f874, %f4648, %f4647; setp.eq.f32 %p739, %f874, 0f7F800000; mov.f32 %f5620, 0f7F800000; @%p739 bra $L__BB0_549; fma.rn.f32 %f5620, %f874, %f873, %f874; $L__BB0_549: setp.lt.f32 %p740, %f871, 0f00000000; and.pred %p19, %p740, %p55; setp.eq.f32 %p742, %f871, 0f00000000; @%p742 bra $L__BB0_553; bra.uni $L__BB0_550; $L__BB0_553: add.f32 %f4653, %f871, %f871; selp.f32 %f5622, %f4653, 0f00000000, %p55; bra.uni $L__BB0_554; $L__BB0_550: mov.b32 %r1590, %f5620; xor.b32 %r1591, %r1590, -2147483648; mov.b32 %f4649, %r1591; selp.f32 %f5622, %f4649, %f5620, %p19; setp.geu.f32 %p743, %f871, 0f00000000; @%p743 bra $L__BB0_554; cvt.rzi.f32.f32 %f4651, %f1007; setp.eq.f32 %p744, %f4651, 0f40000000; @%p744 bra $L__BB0_554; mov.f32 %f5622, 0f7FFFFFFF; $L__BB0_554: add.f32 %f4654, %f872, 0f40000000; mov.b32 %r1592, %f4654; setp.lt.s32 %p746, %r1592, 2139095040; @%p746 bra $L__BB0_559; setp.gtu.f32 %p747, %f872, 0f7F800000; @%p747 bra $L__BB0_558; bra.uni $L__BB0_556; $L__BB0_558: add.f32 %f5622, %f871, 0f40000000; bra.uni $L__BB0_559; $L__BB0_556: setp.neu.f32 %p748, %f872, 0f7F800000; @%p748 
bra $L__BB0_559; selp.f32 %f5622, 0fFF800000, 0f7F800000, %p19; $L__BB0_559: mov.f32 %f5265, 0f3102E308; mov.f32 %f5264, 0fBF317218; mov.f32 %f5263, 0f3FB8AA3B; mov.f32 %f5262, 0f35BFBE8E; mov.f32 %f5261, 0f3F317200; mov.f32 %f5260, 0f3DAAAABD; mov.f32 %f5259, 0f3C4CAF63; mov.f32 %f5258, 0f3B18F0FE; mov.f32 %f5175, 0f3F400000; sub.f32 %f4659, %f5175, %f5622; setp.eq.f32 %p749, %f871, 0f3F800000; selp.f32 %f883, 0fBE800000, %f4659, %p749; add.f32 %f884, %f857, 0fBF000000; abs.f32 %f885, %f884; setp.lt.f32 %p750, %f885, 0f00800000; mul.f32 %f4660, %f885, 0f4B800000; selp.f32 %f4661, %f4660, %f885, %p750; selp.f32 %f4662, 0fC3170000, 0fC2FE0000, %p750; mov.b32 %r1593, %f4661; and.b32 %r1594, %r1593, 8388607; or.b32 %r1595, %r1594, 1065353216; mov.b32 %f4663, %r1595; shr.u32 %r1596, %r1593, 23; cvt.rn.f32.u32 %f4664, %r1596; add.f32 %f4665, %f4662, %f4664; setp.gt.f32 %p751, %f4663, 0f3FB504F3; mul.f32 %f4666, %f4663, 0f3F000000; add.f32 %f4667, %f4665, 0f3F800000; selp.f32 %f4668, %f4667, %f4665, %p751; selp.f32 %f4669, %f4666, %f4663, %p751; add.f32 %f4670, %f4669, 0fBF800000; add.f32 %f4656, %f4669, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4655,%f4656; // end inline asm add.f32 %f4671, %f4670, %f4670; mul.f32 %f4673, %f4655, %f4671; mul.f32 %f4674, %f4673, %f4673; fma.rn.f32 %f4677, %f5258, %f4674, %f5259; fma.rn.f32 %f4679, %f4677, %f4674, %f5260; mul.rn.f32 %f4680, %f4679, %f4674; mul.rn.f32 %f4681, %f4680, %f4673; sub.f32 %f4682, %f4670, %f4673; add.f32 %f4683, %f4682, %f4682; neg.f32 %f4684, %f4673; fma.rn.f32 %f4685, %f4684, %f4670, %f4683; mul.rn.f32 %f4686, %f4655, %f4685; add.f32 %f4687, %f4681, %f4673; sub.f32 %f4688, %f4673, %f4687; add.f32 %f4689, %f4681, %f4688; add.f32 %f4690, %f4686, %f4689; add.f32 %f4691, %f4687, %f4690; sub.f32 %f4692, %f4687, %f4691; add.f32 %f4693, %f4690, %f4692; mul.rn.f32 %f4695, %f4668, %f5261; mul.rn.f32 %f4697, %f4668, %f5262; add.f32 %f4698, %f4695, %f4691; sub.f32 %f4699, %f4695, %f4698; add.f32 %f4700, %f4691, 
%f4699; add.f32 %f4701, %f4693, %f4700; add.f32 %f4702, %f4697, %f4701; add.f32 %f4703, %f4698, %f4702; sub.f32 %f4704, %f4698, %f4703; add.f32 %f4705, %f4702, %f4704; mul.rn.f32 %f4706, %f1007, %f4703; neg.f32 %f4707, %f4706; fma.rn.f32 %f4708, %f1007, %f4703, %f4707; fma.rn.f32 %f4709, %f1007, %f4705, %f4708; fma.rn.f32 %f4711, %f4543, %f4703, %f4709; add.rn.f32 %f4712, %f4706, %f4711; neg.f32 %f4713, %f4712; add.rn.f32 %f4714, %f4706, %f4713; add.rn.f32 %f4715, %f4714, %f4711; mov.b32 %r1597, %f4712; setp.eq.s32 %p752, %r1597, 1118925336; add.s32 %r1598, %r1597, -1; mov.b32 %f4716, %r1598; add.f32 %f4717, %f4715, 0f37000000; selp.f32 %f886, %f4717, %f4715, %p752; selp.f32 %f4718, %f4716, %f4712, %p752; mul.rn.f32 %f4720, %f4718, %f5263; cvt.rzi.f32.f32 %f4721, %f4720; abs.f32 %f4722, %f4721; setp.gt.f32 %p753, %f4722, 0f42FC0000; mov.b32 %r1599, %f4721; and.b32 %r1600, %r1599, -2147483648; or.b32 %r1601, %r1600, 1123811328; mov.b32 %f4723, %r1601; selp.f32 %f4724, %f4723, %f4721, %p753; fma.rn.f32 %f4726, %f4724, %f5264, %f4718; fma.rn.f32 %f4728, %f4724, %f5265, %f4726; mul.f32 %f4729, %f4728, 0f3FB8AA3B; add.f32 %f4730, %f4724, 0f4B40007F; mov.b32 %r1602, %f4730; shl.b32 %r1603, %r1602, 23; mov.b32 %f4731, %r1603; ex2.approx.ftz.f32 %f4732, %f4729; mul.f32 %f887, %f4732, %f4731; setp.eq.f32 %p754, %f887, 0f7F800000; mov.f32 %f5623, 0f7F800000; @%p754 bra $L__BB0_561; fma.rn.f32 %f5623, %f887, %f886, %f887; $L__BB0_561: setp.lt.f32 %p755, %f884, 0f00000000; and.pred %p20, %p755, %p55; setp.eq.f32 %p757, %f884, 0f00000000; @%p757 bra $L__BB0_565; bra.uni $L__BB0_562; $L__BB0_565: add.f32 %f4737, %f884, %f884; selp.f32 %f5625, %f4737, 0f00000000, %p55; bra.uni $L__BB0_566; $L__BB0_562: mov.b32 %r1604, %f5623; xor.b32 %r1605, %r1604, -2147483648; mov.b32 %f4733, %r1605; selp.f32 %f5625, %f4733, %f5623, %p20; setp.geu.f32 %p758, %f884, 0f00000000; @%p758 bra $L__BB0_566; cvt.rzi.f32.f32 %f4735, %f1007; setp.eq.f32 %p759, %f4735, 0f40000000; @%p759 bra $L__BB0_566; 
mov.f32 %f5625, 0f7FFFFFFF; $L__BB0_566: add.f32 %f4738, %f885, 0f40000000; mov.b32 %r1606, %f4738; setp.lt.s32 %p761, %r1606, 2139095040; @%p761 bra $L__BB0_571; setp.gtu.f32 %p762, %f885, 0f7F800000; @%p762 bra $L__BB0_570; bra.uni $L__BB0_568; $L__BB0_570: add.f32 %f5625, %f884, 0f40000000; bra.uni $L__BB0_571; $L__BB0_568: setp.neu.f32 %p763, %f885, 0f7F800000; @%p763 bra $L__BB0_571; selp.f32 %f5625, 0fFF800000, 0f7F800000, %p20; $L__BB0_571: mov.f32 %f5273, 0f3102E308; mov.f32 %f5272, 0fBF317218; mov.f32 %f5271, 0f3FB8AA3B; mov.f32 %f5270, 0f35BFBE8E; mov.f32 %f5269, 0f3F317200; mov.f32 %f5268, 0f3DAAAABD; mov.f32 %f5267, 0f3C4CAF63; mov.f32 %f5266, 0f3B18F0FE; ld.param.f32 %f5214, [g2p2g_param_11]; mov.f32 %f5176, 0f3FC00000; mul.f32 %f4742, %f5625, 0f3F000000; setp.eq.f32 %p764, %f884, 0f3F800000; selp.f32 %f896, 0f3F000000, %f4742, %p764; neg.f32 %f4743, %f856; div.rn.f32 %f897, %f4743, %f5214; sub.f32 %f898, %f5176, %f897; abs.f32 %f899, %f898; setp.lt.f32 %p765, %f899, 0f00800000; mul.f32 %f4745, %f899, 0f4B800000; selp.f32 %f4746, %f4745, %f899, %p765; selp.f32 %f4747, 0fC3170000, 0fC2FE0000, %p765; mov.b32 %r1607, %f4746; and.b32 %r1608, %r1607, 8388607; or.b32 %r1609, %r1608, 1065353216; mov.b32 %f4748, %r1609; shr.u32 %r1610, %r1607, 23; cvt.rn.f32.u32 %f4749, %r1610; add.f32 %f4750, %f4747, %f4749; setp.gt.f32 %p766, %f4748, 0f3FB504F3; mul.f32 %f4751, %f4748, 0f3F000000; add.f32 %f4752, %f4750, 0f3F800000; selp.f32 %f4753, %f4752, %f4750, %p766; selp.f32 %f4754, %f4751, %f4748, %p766; add.f32 %f4755, %f4754, 0fBF800000; add.f32 %f4740, %f4754, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4739,%f4740; // end inline asm add.f32 %f4756, %f4755, %f4755; mul.f32 %f4758, %f4739, %f4756; mul.f32 %f4759, %f4758, %f4758; fma.rn.f32 %f4762, %f5266, %f4759, %f5267; fma.rn.f32 %f4764, %f4762, %f4759, %f5268; mul.rn.f32 %f4765, %f4764, %f4759; mul.rn.f32 %f4766, %f4765, %f4758; sub.f32 %f4767, %f4755, %f4758; add.f32 %f4768, %f4767, %f4767; neg.f32 
%f4769, %f4758; fma.rn.f32 %f4770, %f4769, %f4755, %f4768; mul.rn.f32 %f4771, %f4739, %f4770; add.f32 %f4772, %f4766, %f4758; sub.f32 %f4773, %f4758, %f4772; add.f32 %f4774, %f4766, %f4773; add.f32 %f4775, %f4771, %f4774; add.f32 %f4776, %f4772, %f4775; sub.f32 %f4777, %f4772, %f4776; add.f32 %f4778, %f4775, %f4777; mul.rn.f32 %f4780, %f4753, %f5269; mul.rn.f32 %f4782, %f4753, %f5270; add.f32 %f4783, %f4780, %f4776; sub.f32 %f4784, %f4780, %f4783; add.f32 %f4785, %f4776, %f4784; add.f32 %f4786, %f4778, %f4785; add.f32 %f4787, %f4782, %f4786; add.f32 %f4788, %f4783, %f4787; sub.f32 %f4789, %f4783, %f4788; add.f32 %f4790, %f4787, %f4789; mul.rn.f32 %f4791, %f1007, %f4788; neg.f32 %f4792, %f4791; fma.rn.f32 %f4793, %f1007, %f4788, %f4792; fma.rn.f32 %f4794, %f1007, %f4790, %f4793; fma.rn.f32 %f4796, %f4543, %f4788, %f4794; add.rn.f32 %f4797, %f4791, %f4796; neg.f32 %f4798, %f4797; add.rn.f32 %f4799, %f4791, %f4798; add.rn.f32 %f4800, %f4799, %f4796; mov.b32 %r1611, %f4797; setp.eq.s32 %p767, %r1611, 1118925336; add.s32 %r1612, %r1611, -1; mov.b32 %f4801, %r1612; add.f32 %f4802, %f4800, 0f37000000; selp.f32 %f900, %f4802, %f4800, %p767; selp.f32 %f4803, %f4801, %f4797, %p767; mul.rn.f32 %f4805, %f4803, %f5271; cvt.rzi.f32.f32 %f4806, %f4805; abs.f32 %f4807, %f4806; setp.gt.f32 %p768, %f4807, 0f42FC0000; mov.b32 %r1613, %f4806; and.b32 %r1614, %r1613, -2147483648; or.b32 %r1615, %r1614, 1123811328; mov.b32 %f4808, %r1615; selp.f32 %f4809, %f4808, %f4806, %p768; fma.rn.f32 %f4811, %f4809, %f5272, %f4803; fma.rn.f32 %f4813, %f4809, %f5273, %f4811; mul.f32 %f4814, %f4813, 0f3FB8AA3B; add.f32 %f4815, %f4809, 0f4B40007F; mov.b32 %r1616, %f4815; shl.b32 %r1617, %r1616, 23; mov.b32 %f4816, %r1617; ex2.approx.ftz.f32 %f4817, %f4814; mul.f32 %f901, %f4817, %f4816; setp.eq.f32 %p769, %f901, 0f7F800000; mov.f32 %f5626, 0f7F800000; @%p769 bra $L__BB0_573; fma.rn.f32 %f5626, %f901, %f900, %f901; $L__BB0_573: setp.lt.f32 %p770, %f898, 0f00000000; and.pred %p21, %p770, %p55; 
setp.eq.f32 %p772, %f898, 0f00000000; @%p772 bra $L__BB0_577; bra.uni $L__BB0_574; $L__BB0_577: add.f32 %f4822, %f898, %f898; selp.f32 %f5628, %f4822, 0f00000000, %p55; bra.uni $L__BB0_578; $L__BB0_574: mov.b32 %r1618, %f5626; xor.b32 %r1619, %r1618, -2147483648; mov.b32 %f4818, %r1619; selp.f32 %f5628, %f4818, %f5626, %p21; setp.geu.f32 %p773, %f898, 0f00000000; @%p773 bra $L__BB0_578; cvt.rzi.f32.f32 %f4820, %f1007; setp.eq.f32 %p774, %f4820, 0f40000000; @%p774 bra $L__BB0_578; mov.f32 %f5628, 0f7FFFFFFF; $L__BB0_578: add.f32 %f4823, %f899, 0f40000000; mov.b32 %r1620, %f4823; setp.lt.s32 %p776, %r1620, 2139095040; @%p776 bra $L__BB0_583; setp.gtu.f32 %p777, %f899, 0f7F800000; @%p777 bra $L__BB0_582; bra.uni $L__BB0_580; $L__BB0_582: add.f32 %f5628, %f898, 0f40000000; bra.uni $L__BB0_583; $L__BB0_580: setp.neu.f32 %p778, %f899, 0f7F800000; @%p778 bra $L__BB0_583; selp.f32 %f5628, 0fFF800000, 0f7F800000, %p21; $L__BB0_583: mov.f32 %f5281, 0f3102E308; mov.f32 %f5280, 0fBF317218; mov.f32 %f5279, 0f3FB8AA3B; mov.f32 %f5278, 0f35BFBE8E; mov.f32 %f5277, 0f3F317200; mov.f32 %f5276, 0f3DAAAABD; mov.f32 %f5275, 0f3C4CAF63; mov.f32 %f5274, 0f3B18F0FE; mul.f32 %f4827, %f5628, 0f3F000000; setp.eq.f32 %p779, %f898, 0f3F800000; selp.f32 %f910, 0f3F000000, %f4827, %p779; add.f32 %f911, %f897, 0fBF800000; abs.f32 %f912, %f911; setp.lt.f32 %p780, %f912, 0f00800000; mul.f32 %f4828, %f912, 0f4B800000; selp.f32 %f4829, %f4828, %f912, %p780; selp.f32 %f4830, 0fC3170000, 0fC2FE0000, %p780; mov.b32 %r1621, %f4829; and.b32 %r1622, %r1621, 8388607; or.b32 %r1623, %r1622, 1065353216; mov.b32 %f4831, %r1623; shr.u32 %r1624, %r1621, 23; cvt.rn.f32.u32 %f4832, %r1624; add.f32 %f4833, %f4830, %f4832; setp.gt.f32 %p781, %f4831, 0f3FB504F3; mul.f32 %f4834, %f4831, 0f3F000000; add.f32 %f4835, %f4833, 0f3F800000; selp.f32 %f4836, %f4835, %f4833, %p781; selp.f32 %f4837, %f4834, %f4831, %p781; add.f32 %f4838, %f4837, 0fBF800000; add.f32 %f4825, %f4837, 0f3F800000; // begin inline asm 
rcp.approx.ftz.f32 %f4824,%f4825; // end inline asm add.f32 %f4839, %f4838, %f4838; mul.f32 %f4841, %f4824, %f4839; mul.f32 %f4842, %f4841, %f4841; fma.rn.f32 %f4845, %f5274, %f4842, %f5275; fma.rn.f32 %f4847, %f4845, %f4842, %f5276; mul.rn.f32 %f4848, %f4847, %f4842; mul.rn.f32 %f4849, %f4848, %f4841; sub.f32 %f4850, %f4838, %f4841; add.f32 %f4851, %f4850, %f4850; neg.f32 %f4852, %f4841; fma.rn.f32 %f4853, %f4852, %f4838, %f4851; mul.rn.f32 %f4854, %f4824, %f4853; add.f32 %f4855, %f4849, %f4841; sub.f32 %f4856, %f4841, %f4855; add.f32 %f4857, %f4849, %f4856; add.f32 %f4858, %f4854, %f4857; add.f32 %f4859, %f4855, %f4858; sub.f32 %f4860, %f4855, %f4859; add.f32 %f4861, %f4858, %f4860; mul.rn.f32 %f4863, %f4836, %f5277; mul.rn.f32 %f4865, %f4836, %f5278; add.f32 %f4866, %f4863, %f4859; sub.f32 %f4867, %f4863, %f4866; add.f32 %f4868, %f4859, %f4867; add.f32 %f4869, %f4861, %f4868; add.f32 %f4870, %f4865, %f4869; add.f32 %f4871, %f4866, %f4870; sub.f32 %f4872, %f4866, %f4871; add.f32 %f4873, %f4870, %f4872; mul.rn.f32 %f4874, %f1007, %f4871; neg.f32 %f4875, %f4874; fma.rn.f32 %f4876, %f1007, %f4871, %f4875; fma.rn.f32 %f4877, %f1007, %f4873, %f4876; fma.rn.f32 %f4879, %f4543, %f4871, %f4877; add.rn.f32 %f4880, %f4874, %f4879; neg.f32 %f4881, %f4880; add.rn.f32 %f4882, %f4874, %f4881; add.rn.f32 %f4883, %f4882, %f4879; mov.b32 %r1625, %f4880; setp.eq.s32 %p782, %r1625, 1118925336; add.s32 %r1626, %r1625, -1; mov.b32 %f4884, %r1626; add.f32 %f4885, %f4883, 0f37000000; selp.f32 %f913, %f4885, %f4883, %p782; selp.f32 %f4886, %f4884, %f4880, %p782; mul.rn.f32 %f4888, %f4886, %f5279; cvt.rzi.f32.f32 %f4889, %f4888; abs.f32 %f4890, %f4889; setp.gt.f32 %p783, %f4890, 0f42FC0000; mov.b32 %r1627, %f4889; and.b32 %r1628, %r1627, -2147483648; or.b32 %r1629, %r1628, 1123811328; mov.b32 %f4891, %r1629; selp.f32 %f4892, %f4891, %f4889, %p783; fma.rn.f32 %f4894, %f4892, %f5280, %f4886; fma.rn.f32 %f4896, %f4892, %f5281, %f4894; mul.f32 %f4897, %f4896, 0f3FB8AA3B; add.f32 %f4898, 
%f4892, 0f4B40007F; mov.b32 %r1630, %f4898; shl.b32 %r1631, %r1630, 23; mov.b32 %f4899, %r1631; ex2.approx.ftz.f32 %f4900, %f4897; mul.f32 %f914, %f4900, %f4899; setp.eq.f32 %p784, %f914, 0f7F800000; mov.f32 %f5629, 0f7F800000; @%p784 bra $L__BB0_585; fma.rn.f32 %f5629, %f914, %f913, %f914; $L__BB0_585: setp.lt.f32 %p785, %f911, 0f00000000; and.pred %p22, %p785, %p55; setp.eq.f32 %p787, %f911, 0f00000000; @%p787 bra $L__BB0_589; bra.uni $L__BB0_586; $L__BB0_589: add.f32 %f4905, %f911, %f911; selp.f32 %f5631, %f4905, 0f00000000, %p55; bra.uni $L__BB0_590; $L__BB0_586: mov.b32 %r1632, %f5629; xor.b32 %r1633, %r1632, -2147483648; mov.b32 %f4901, %r1633; selp.f32 %f5631, %f4901, %f5629, %p22; setp.geu.f32 %p788, %f911, 0f00000000; @%p788 bra $L__BB0_590; cvt.rzi.f32.f32 %f4903, %f1007; setp.eq.f32 %p789, %f4903, 0f40000000; @%p789 bra $L__BB0_590; mov.f32 %f5631, 0f7FFFFFFF; $L__BB0_590: add.f32 %f4906, %f912, 0f40000000; mov.b32 %r1634, %f4906; setp.lt.s32 %p791, %r1634, 2139095040; @%p791 bra $L__BB0_595; setp.gtu.f32 %p792, %f912, 0f7F800000; @%p792 bra $L__BB0_594; bra.uni $L__BB0_592; $L__BB0_594: add.f32 %f5631, %f911, 0f40000000; bra.uni $L__BB0_595; $L__BB0_592: setp.neu.f32 %p793, %f912, 0f7F800000; @%p793 bra $L__BB0_595; selp.f32 %f5631, 0fFF800000, 0f7F800000, %p22; $L__BB0_595: mov.f32 %f5289, 0f3102E308; mov.f32 %f5288, 0fBF317218; mov.f32 %f5287, 0f3FB8AA3B; mov.f32 %f5286, 0f35BFBE8E; mov.f32 %f5285, 0f3F317200; mov.f32 %f5284, 0f3DAAAABD; mov.f32 %f5283, 0f3C4CAF63; mov.f32 %f5282, 0f3B18F0FE; mov.f32 %f5177, 0f3F400000; sub.f32 %f4911, %f5177, %f5631; setp.eq.f32 %p794, %f911, 0f3F800000; selp.f32 %f923, 0fBE800000, %f4911, %p794; add.f32 %f924, %f897, 0fBF000000; abs.f32 %f925, %f924; setp.lt.f32 %p795, %f925, 0f00800000; mul.f32 %f4912, %f925, 0f4B800000; selp.f32 %f4913, %f4912, %f925, %p795; selp.f32 %f4914, 0fC3170000, 0fC2FE0000, %p795; mov.b32 %r1635, %f4913; and.b32 %r1636, %r1635, 8388607; or.b32 %r1637, %r1636, 1065353216; mov.b32 %f4915, 
%r1637; shr.u32 %r1638, %r1635, 23; cvt.rn.f32.u32 %f4916, %r1638; add.f32 %f4917, %f4914, %f4916; setp.gt.f32 %p796, %f4915, 0f3FB504F3; mul.f32 %f4918, %f4915, 0f3F000000; add.f32 %f4919, %f4917, 0f3F800000; selp.f32 %f4920, %f4919, %f4917, %p796; selp.f32 %f4921, %f4918, %f4915, %p796; add.f32 %f4922, %f4921, 0fBF800000; add.f32 %f4908, %f4921, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4907,%f4908; // end inline asm add.f32 %f4923, %f4922, %f4922; mul.f32 %f4925, %f4907, %f4923; mul.f32 %f4926, %f4925, %f4925; fma.rn.f32 %f4929, %f5282, %f4926, %f5283; fma.rn.f32 %f4931, %f4929, %f4926, %f5284; mul.rn.f32 %f4932, %f4931, %f4926; mul.rn.f32 %f4933, %f4932, %f4925; sub.f32 %f4934, %f4922, %f4925; add.f32 %f4935, %f4934, %f4934; neg.f32 %f4936, %f4925; fma.rn.f32 %f4937, %f4936, %f4922, %f4935; mul.rn.f32 %f4938, %f4907, %f4937; add.f32 %f4939, %f4933, %f4925; sub.f32 %f4940, %f4925, %f4939; add.f32 %f4941, %f4933, %f4940; add.f32 %f4942, %f4938, %f4941; add.f32 %f4943, %f4939, %f4942; sub.f32 %f4944, %f4939, %f4943; add.f32 %f4945, %f4942, %f4944; mul.rn.f32 %f4947, %f4920, %f5285; mul.rn.f32 %f4949, %f4920, %f5286; add.f32 %f4950, %f4947, %f4943; sub.f32 %f4951, %f4947, %f4950; add.f32 %f4952, %f4943, %f4951; add.f32 %f4953, %f4945, %f4952; add.f32 %f4954, %f4949, %f4953; add.f32 %f4955, %f4950, %f4954; sub.f32 %f4956, %f4950, %f4955; add.f32 %f4957, %f4954, %f4956; mul.rn.f32 %f4958, %f1007, %f4955; neg.f32 %f4959, %f4958; fma.rn.f32 %f4960, %f1007, %f4955, %f4959; fma.rn.f32 %f4961, %f1007, %f4957, %f4960; fma.rn.f32 %f4963, %f4543, %f4955, %f4961; add.rn.f32 %f4964, %f4958, %f4963; neg.f32 %f4965, %f4964; add.rn.f32 %f4966, %f4958, %f4965; add.rn.f32 %f4967, %f4966, %f4963; mov.b32 %r1639, %f4964; setp.eq.s32 %p797, %r1639, 1118925336; add.s32 %r1640, %r1639, -1; mov.b32 %f4968, %r1640; add.f32 %f4969, %f4967, 0f37000000; selp.f32 %f926, %f4969, %f4967, %p797; selp.f32 %f4970, %f4968, %f4964, %p797; mul.rn.f32 %f4972, %f4970, %f5287; cvt.rzi.f32.f32 
%f4973, %f4972; abs.f32 %f4974, %f4973; setp.gt.f32 %p798, %f4974, 0f42FC0000; mov.b32 %r1641, %f4973; and.b32 %r1642, %r1641, -2147483648; or.b32 %r1643, %r1642, 1123811328; mov.b32 %f4975, %r1643; selp.f32 %f4976, %f4975, %f4973, %p798; fma.rn.f32 %f4978, %f4976, %f5288, %f4970; fma.rn.f32 %f4980, %f4976, %f5289, %f4978; mul.f32 %f4981, %f4980, 0f3FB8AA3B; add.f32 %f4982, %f4976, 0f4B40007F; mov.b32 %r1644, %f4982; shl.b32 %r1645, %r1644, 23; mov.b32 %f4983, %r1645; ex2.approx.ftz.f32 %f4984, %f4981; mul.f32 %f927, %f4984, %f4983; setp.eq.f32 %p799, %f927, 0f7F800000; mov.f32 %f5632, 0f7F800000; @%p799 bra $L__BB0_597; fma.rn.f32 %f5632, %f927, %f926, %f927; $L__BB0_597: setp.lt.f32 %p800, %f924, 0f00000000; and.pred %p23, %p800, %p55; setp.eq.f32 %p802, %f924, 0f00000000; @%p802 bra $L__BB0_601; bra.uni $L__BB0_598; $L__BB0_601: add.f32 %f4989, %f924, %f924; selp.f32 %f5634, %f4989, 0f00000000, %p55; bra.uni $L__BB0_602; $L__BB0_598: mov.b32 %r1646, %f5632; xor.b32 %r1647, %r1646, -2147483648; mov.b32 %f4985, %r1647; selp.f32 %f5634, %f4985, %f5632, %p23; setp.geu.f32 %p803, %f924, 0f00000000; @%p803 bra $L__BB0_602; cvt.rzi.f32.f32 %f4987, %f1007; setp.eq.f32 %p804, %f4987, 0f40000000; @%p804 bra $L__BB0_602; mov.f32 %f5634, 0f7FFFFFFF; $L__BB0_602: add.f32 %f4990, %f925, 0f40000000; mov.b32 %r1648, %f4990; setp.lt.s32 %p806, %r1648, 2139095040; @%p806 bra $L__BB0_607; setp.gtu.f32 %p807, %f925, 0f7F800000; @%p807 bra $L__BB0_606; bra.uni $L__BB0_604; $L__BB0_606: add.f32 %f5634, %f924, 0f40000000; bra.uni $L__BB0_607; $L__BB0_604: setp.neu.f32 %p808, %f925, 0f7F800000; @%p808 bra $L__BB0_607; selp.f32 %f5634, 0fFF800000, 0f7F800000, %p23; $L__BB0_607: mov.u64 %rd1233, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; ld.param.f32 %f5180, [g2p2g_param_11]; add.f32 %f5179, %f5180, %f5180; ld.param.f32 %f5178, [g2p2g_param_0]; mul.f32 %f4991, %f5634, 0f3F000000; setp.eq.f32 %p809, %f924, 0f3F800000; selp.f32 %f936, 
0f3F000000, %f4991, %p809; mul.f32 %f4992, %f4, %f5533; mul.f32 %f4993, %f4, %f5534; mul.f32 %f4994, %f4, %f5535; mul.f32 %f4995, %f4, %f5536; mul.f32 %f4996, %f12, %f5; mul.f32 %f4997, %f4996, %f5178; mul.f32 %f4998, %f4997, %f5613; mul.f32 %f4999, %f4997, %f5614; mul.f32 %f5000, %f4997, %f5615; mul.f32 %f5001, %f4997, %f5616; sub.f32 %f937, %f4992, %f4998; sub.f32 %f938, %f4993, %f4999; sub.f32 %f939, %f4994, %f5000; sub.f32 %f940, %f4995, %f5001; add.u64 %rd1000, %SPL, 96; ld.local.u64 %rd1001, [%rd1000]; cvt.u32.u64 %r1649, %rd1001; mov.b32 %f5002, %r1649; mul.f32 %f5003, %f4, %f5002; shr.u64 %rd1002, %rd1001, 32; cvt.u32.u64 %r1650, %rd1002; mov.b32 %f5004, %r1650; mul.f32 %f5005, %f4, %f5004; fma.rn.f32 %f941, %f5178, 0f00000000, %f5003; fma.rn.f32 %f942, %f5178, 0f00000000, %f5005; setp.gt.f32 %p810, %f681, 0f00000000; selp.f32 %f943, %f4, 0f00000000, %p810; mul.f32 %f944, %f495, %f943; sub.f32 %f5006, %f853, %f13; setp.gt.f32 %p811, %f5006, 0f5EFFFFFF; max.f32 %f5007, %f5006, 0fDF000000; cvt.rzi.s64.f32 %rd1003, %f5007; selp.b64 %rd1004, 4294967295, %rd1003, %p811; setp.num.f32 %p812, %f5006, %f5006; selp.b64 %rd1005, %rd1004, 0, %p812; sub.f32 %f5008, %f854, %f14; setp.gt.f32 %p813, %f5008, 0f5EFFFFFF; max.f32 %f5009, %f5008, 0fDF000000; cvt.rzi.s64.f32 %rd1006, %f5009; setp.num.f32 %p814, %f5008, %f5008; add.s64 %rd1007, %rd1005, %rd66; shl.b64 %rd1008, %rd1006, 3; selp.b64 %rd1009, 4294967288, %rd1008, %p813; selp.b64 %rd1010, %rd1009, 0, %p814; add.s64 %rd1011, %rd1007, %rd1010; and.b64 %rd271, %rd1011, 4294967295; add.f32 %f945, %f855, %f5179; mul.f32 %f946, %f937, %f945; add.f32 %f947, %f856, %f5179; mul.f32 %f948, %f939, %f947; add.f32 %f5010, %f946, %f948; add.f32 %f949, %f941, %f5010; shl.b64 %rd1012, %rd1011, 6; and.b64 %rd1013, %rd1012, 274877906880; cvta.shared.u64 %rd1015, %rd1233; add.s64 %rd1016, %rd1015, %rd1013; add.s64 %rd272, %rd1016, 1212; mov.b32 %r1794, %f495; $L__BB0_608: // begin inline asm cvta.to.shared.u64 %rd1017, 
%rd272;atom.acquire.shared.exch.b32 %r1651, [%rd1017], %r1; // end inline asm setp.ne.s32 %p815, %r1651, -1; @%p815 bra $L__BB0_608; ld.param.f32 %f5216, [g2p2g_param_11]; mul.f32 %f5215, %f5216, 0f00000000; mov.u64 %rd1234, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; mul.f32 %f5011, %f896, %f936; mul.f32 %f950, %f938, %f945; mul.f32 %f951, %f940, %f947; add.f32 %f5012, %f950, %f951; add.f32 %f5013, %f942, %f5012; shl.b64 %rd1021, %rd271, 6; add.s64 %rd1023, %rd1234, %rd1021; add.s64 %rd273, %rd1023, 1172; ld.shared.f32 %f5014, [%rd1023+1172]; fma.rn.f32 %f5015, %f4, %f5011, %f5014; st.shared.f32 [%rd1023+1172], %f5015; ld.shared.v2.f32 {%f5016, %f5017}, [%rd1023+1176]; fma.rn.f32 %f5020, %f949, %f5011, %f5016; st.shared.f32 [%rd1023+1176], %f5020; fma.rn.f32 %f5021, %f5013, %f5011, %f5017; st.shared.f32 [%rd1023+1180], %f5021; ld.shared.v2.f32 {%f5022, %f5023}, [%rd1023+1192]; fma.rn.f32 %f5026, %f944, %f5011, %f5023; fma.rn.f32 %f5027, %f943, %f5011, %f5022; st.shared.v2.f32 [%rd1023+1192], {%f5027, %f5026}; mov.u32 %r1654, -1; // begin inline asm cvta.to.shared.u64 %rd1019, %rd272;atom.release.shared.exch.b32 %r1653, [%rd1019], %r1654; // end inline asm add.f32 %f952, %f856, %f5215; mul.f32 %f953, %f939, %f952; add.f32 %f5028, %f946, %f953; add.f32 %f954, %f941, %f5028; add.s64 %rd1025, %rd1015, %rd1021; add.s64 %rd274, %rd1025, 188; $L__BB0_610: // begin inline asm cvta.to.shared.u64 %rd1026, %rd274;atom.acquire.shared.exch.b32 %r1655, [%rd1026], %r1; // end inline asm setp.ne.s32 %p816, %r1655, -1; @%p816 bra $L__BB0_610; ld.param.f32 %f5217, [g2p2g_param_11]; mul.f32 %f5029, %f896, %f910; mul.f32 %f955, %f940, %f952; add.f32 %f5030, %f950, %f955; add.f32 %f5031, %f942, %f5030; ld.shared.f32 %f5032, [%rd273+-1024]; fma.rn.f32 %f5033, %f4, %f5029, %f5032; st.shared.f32 [%rd273+-1024], %f5033; ld.shared.v2.f32 {%f5034, %f5035}, [%rd273+-1020]; fma.rn.f32 %f5038, %f954, %f5029, %f5034; st.shared.f32 [%rd273+-1020], 
%f5038; fma.rn.f32 %f5039, %f5031, %f5029, %f5035; st.shared.f32 [%rd273+-1016], %f5039; ld.shared.v2.f32 {%f5040, %f5041}, [%rd273+-1004]; fma.rn.f32 %f5044, %f944, %f5029, %f5041; fma.rn.f32 %f5045, %f943, %f5029, %f5040; st.shared.v2.f32 [%rd273+-1004], {%f5045, %f5044}; // begin inline asm cvta.to.shared.u64 %rd1028, %rd274;atom.release.shared.exch.b32 %r1657, [%rd1028], %r1654; // end inline asm add.f32 %f956, %f856, %f5217; mul.f32 %f957, %f939, %f956; add.f32 %f5046, %f946, %f957; add.f32 %f958, %f941, %f5046; add.s64 %rd275, %rd1025, 700; $L__BB0_612: // begin inline asm cvta.to.shared.u64 %rd1034, %rd275;atom.acquire.shared.exch.b32 %r1659, [%rd1034], %r1; // end inline asm setp.ne.s32 %p817, %r1659, -1; @%p817 bra $L__BB0_612; ld.param.f32 %f5219, [g2p2g_param_11]; mul.f32 %f5218, %f5219, 0f00000000; mul.f32 %f5047, %f896, %f923; mul.f32 %f959, %f940, %f956; add.f32 %f5048, %f950, %f959; add.f32 %f5049, %f942, %f5048; ld.shared.f32 %f5050, [%rd273+-512]; fma.rn.f32 %f5051, %f4, %f5047, %f5050; st.shared.f32 [%rd273+-512], %f5051; ld.shared.v2.f32 {%f5052, %f5053}, [%rd273+-508]; fma.rn.f32 %f5056, %f958, %f5047, %f5052; st.shared.f32 [%rd273+-508], %f5056; fma.rn.f32 %f5057, %f5049, %f5047, %f5053; st.shared.f32 [%rd273+-504], %f5057; ld.shared.v2.f32 {%f5058, %f5059}, [%rd273+-492]; fma.rn.f32 %f5062, %f944, %f5047, %f5059; fma.rn.f32 %f5063, %f943, %f5047, %f5058; st.shared.v2.f32 [%rd273+-492], {%f5063, %f5062}; // begin inline asm cvta.to.shared.u64 %rd1036, %rd275;atom.release.shared.exch.b32 %r1661, [%rd1036], %r1654; // end inline asm add.f32 %f960, %f855, %f5218; mul.f32 %f961, %f937, %f960; add.f32 %f5064, %f961, %f948; add.f32 %f962, %f941, %f5064; add.s64 %rd276, %rd1025, 1084; $L__BB0_614: // begin inline asm cvta.to.shared.u64 %rd1042, %rd276;atom.acquire.shared.exch.b32 %r1663, [%rd1042], %r1; // end inline asm setp.ne.s32 %p818, %r1663, -1; @%p818 bra $L__BB0_614; mul.f32 %f5065, %f870, %f936; mul.f32 %f963, %f938, %f960; add.f32 %f5066, 
%f963, %f951; add.f32 %f5067, %f942, %f5066; ld.shared.f32 %f5068, [%rd273+-128]; fma.rn.f32 %f5069, %f4, %f5065, %f5068; st.shared.f32 [%rd273+-128], %f5069; ld.shared.v2.f32 {%f5070, %f5071}, [%rd273+-124]; fma.rn.f32 %f5074, %f962, %f5065, %f5070; st.shared.f32 [%rd273+-124], %f5074; fma.rn.f32 %f5075, %f5067, %f5065, %f5071; st.shared.f32 [%rd273+-120], %f5075; ld.shared.v2.f32 {%f5076, %f5077}, [%rd273+-108]; fma.rn.f32 %f5080, %f944, %f5065, %f5077; fma.rn.f32 %f5081, %f943, %f5065, %f5076; st.shared.v2.f32 [%rd273+-108], {%f5081, %f5080}; // begin inline asm cvta.to.shared.u64 %rd1044, %rd276;atom.release.shared.exch.b32 %r1665, [%rd1044], %r1654; // end inline asm add.f32 %f5082, %f961, %f953; add.f32 %f964, %f941, %f5082; add.s64 %rd277, %rd1025, 60; $L__BB0_616: // begin inline asm cvta.to.shared.u64 %rd1050, %rd277;atom.acquire.shared.exch.b32 %r1667, [%rd1050], %r1; // end inline asm setp.ne.s32 %p819, %r1667, -1; @%p819 bra $L__BB0_616; mul.f32 %f5083, %f870, %f910; add.f32 %f5084, %f963, %f955; add.f32 %f5085, %f942, %f5084; ld.shared.f32 %f5086, [%rd273+-1152]; fma.rn.f32 %f5087, %f4, %f5083, %f5086; st.shared.f32 [%rd273+-1152], %f5087; ld.shared.v2.f32 {%f5088, %f5089}, [%rd273+-1148]; fma.rn.f32 %f5092, %f964, %f5083, %f5088; st.shared.f32 [%rd273+-1148], %f5092; fma.rn.f32 %f5093, %f5085, %f5083, %f5089; st.shared.f32 [%rd273+-1144], %f5093; ld.shared.v2.f32 {%f5094, %f5095}, [%rd273+-1132]; fma.rn.f32 %f5098, %f944, %f5083, %f5095; fma.rn.f32 %f5099, %f943, %f5083, %f5094; st.shared.v2.f32 [%rd273+-1132], {%f5099, %f5098}; // begin inline asm cvta.to.shared.u64 %rd1052, %rd277;atom.release.shared.exch.b32 %r1669, [%rd1052], %r1654; // end inline asm add.f32 %f5100, %f961, %f957; add.f32 %f965, %f941, %f5100; add.s64 %rd278, %rd277, 512; $L__BB0_618: // begin inline asm cvta.to.shared.u64 %rd1054, %rd278;atom.acquire.shared.exch.b32 %r1671, [%rd1054], %r1; // end inline asm setp.ne.s32 %p820, %r1671, -1; @%p820 bra $L__BB0_618; ld.param.f32 
%f5220, [g2p2g_param_11]; mul.f32 %f5101, %f870, %f923; add.f32 %f5102, %f963, %f959; add.f32 %f5103, %f942, %f5102; ld.shared.f32 %f5104, [%rd273+-640]; fma.rn.f32 %f5105, %f4, %f5101, %f5104; st.shared.f32 [%rd273+-640], %f5105; ld.shared.v2.f32 {%f5106, %f5107}, [%rd273+-636]; fma.rn.f32 %f5110, %f965, %f5101, %f5106; st.shared.f32 [%rd273+-636], %f5110; fma.rn.f32 %f5111, %f5103, %f5101, %f5107; st.shared.f32 [%rd273+-632], %f5111; ld.shared.v2.f32 {%f5112, %f5113}, [%rd273+-620]; fma.rn.f32 %f5116, %f944, %f5101, %f5113; fma.rn.f32 %f5117, %f943, %f5101, %f5112; st.shared.v2.f32 [%rd273+-620], {%f5117, %f5116}; // begin inline asm cvta.to.shared.u64 %rd1056, %rd278;atom.release.shared.exch.b32 %r1673, [%rd1056], %r1654; // end inline asm add.f32 %f966, %f855, %f5220; mul.f32 %f967, %f937, %f966; add.f32 %f5118, %f967, %f948; add.f32 %f968, %f941, %f5118; add.s64 %rd279, %rd277, 1088; $L__BB0_620: // begin inline asm cvta.to.shared.u64 %rd1058, %rd279;atom.acquire.shared.exch.b32 %r1675, [%rd1058], %r1; // end inline asm setp.ne.s32 %p821, %r1675, -1; @%p821 bra $L__BB0_620; mul.f32 %f5119, %f883, %f936; mul.f32 %f969, %f938, %f966; add.f32 %f5120, %f969, %f951; add.f32 %f5121, %f942, %f5120; ld.shared.f32 %f5122, [%rd273+-64]; fma.rn.f32 %f5123, %f4, %f5119, %f5122; st.shared.f32 [%rd273+-64], %f5123; ld.shared.v2.f32 {%f5124, %f5125}, [%rd273+-60]; fma.rn.f32 %f5128, %f968, %f5119, %f5124; st.shared.f32 [%rd273+-60], %f5128; fma.rn.f32 %f5129, %f5121, %f5119, %f5125; st.shared.f32 [%rd273+-56], %f5129; ld.shared.v2.f32 {%f5130, %f5131}, [%rd273+-44]; fma.rn.f32 %f5134, %f944, %f5119, %f5131; fma.rn.f32 %f5135, %f943, %f5119, %f5130; st.shared.v2.f32 [%rd273+-44], {%f5135, %f5134}; // begin inline asm cvta.to.shared.u64 %rd1060, %rd279;atom.release.shared.exch.b32 %r1677, [%rd1060], %r1654; // end inline asm add.f32 %f5136, %f967, %f953; add.f32 %f970, %f941, %f5136; add.s64 %rd280, %rd277, 64; $L__BB0_622: // begin inline asm cvta.to.shared.u64 %rd1062, 
%rd280;atom.acquire.shared.exch.b32 %r1679, [%rd1062], %r1; // end inline asm setp.ne.s32 %p822, %r1679, -1; @%p822 bra $L__BB0_622; mul.f32 %f5137, %f883, %f910; add.f32 %f5138, %f969, %f955; add.f32 %f5139, %f942, %f5138; ld.shared.f32 %f5140, [%rd273+-1088]; fma.rn.f32 %f5141, %f4, %f5137, %f5140; st.shared.f32 [%rd273+-1088], %f5141; ld.shared.v2.f32 {%f5142, %f5143}, [%rd273+-1084]; fma.rn.f32 %f5146, %f970, %f5137, %f5142; st.shared.f32 [%rd273+-1084], %f5146; fma.rn.f32 %f5147, %f5139, %f5137, %f5143; st.shared.f32 [%rd273+-1080], %f5147; ld.shared.v2.f32 {%f5148, %f5149}, [%rd273+-1068]; fma.rn.f32 %f5152, %f944, %f5137, %f5149; fma.rn.f32 %f5153, %f943, %f5137, %f5148; st.shared.v2.f32 [%rd273+-1068], {%f5153, %f5152}; // begin inline asm cvta.to.shared.u64 %rd1064, %rd280;atom.release.shared.exch.b32 %r1681, [%rd1064], %r1654; // end inline asm add.f32 %f5154, %f967, %f957; add.f32 %f971, %f941, %f5154; add.s64 %rd281, %rd277, 576; $L__BB0_624: // begin inline asm cvta.to.shared.u64 %rd1066, %rd281;atom.acquire.shared.exch.b32 %r1683, [%rd1066], %r1; // end inline asm setp.ne.s32 %p823, %r1683, -1; @%p823 bra $L__BB0_624; mul.f32 %f5155, %f883, %f923; add.f32 %f5156, %f969, %f959; add.f32 %f5157, %f942, %f5156; ld.shared.f32 %f5158, [%rd273+-576]; fma.rn.f32 %f5159, %f4, %f5155, %f5158; st.shared.f32 [%rd273+-576], %f5159; ld.shared.v2.f32 {%f5160, %f5161}, [%rd273+-572]; fma.rn.f32 %f5164, %f971, %f5155, %f5160; st.shared.f32 [%rd273+-572], %f5164; fma.rn.f32 %f5165, %f5157, %f5155, %f5161; st.shared.f32 [%rd273+-568], %f5165; ld.shared.v2.f32 {%f5166, %f5167}, [%rd273+-556]; fma.rn.f32 %f5170, %f944, %f5155, %f5167; fma.rn.f32 %f5171, %f943, %f5155, %f5166; st.shared.v2.f32 [%rd273+-556], {%f5171, %f5170}; // begin inline asm cvta.to.shared.u64 %rd1068, %rd281;atom.release.shared.exch.b32 %r1685, [%rd1068], %r1654; // end inline asm mov.u16 %rs35, 0; $L__BB0_627: mul.wide.u32 %rd1252, %r8, 8; ld.param.u64 %rd1251, [g2p2g_param_7]; cvta.to.global.u64 
%rd1250, %rd1251; add.s64 %rd1249, %rd1250, %rd1252; ld.param.u64 %rd1248, [g2p2g_param_6]; mul.wide.u32 %rd1247, %r8, 32; cvta.to.global.u64 %rd1246, %rd1248; add.s64 %rd1245, %rd1246, %rd1247; ld.param.u64 %rd1244, [g2p2g_param_5]; cvta.to.global.u64 %rd1243, %rd1244; add.s64 %rd1242, %rd1243, %rd1252; ld.param.u64 %rd1241, [g2p2g_param_4]; cvta.to.global.u64 %rd1240, %rd1241; add.s64 %rd1239, %rd1240, %rd1252; ld.param.u64 %rd1238, [g2p2g_param_3]; mul.wide.u32 %rd1237, %r8, 24; cvta.to.global.u64 %rd1236, %rd1238; add.s64 %rd1235, %rd1236, %rd1237; st.global.v4.u8 [%rd1235], {%rs35, %rs8, %rs9, %rs10}; shr.u64 %rd1082, %rd64, 32; st.global.u32 [%rd1235+8], %rd1082; st.global.u32 [%rd1235+4], %rd64; st.global.u32 [%rd1235+12], %r9; st.global.u64 [%rd1235+16], %rd65; st.global.f32 [%rd1239], %f115; st.global.f32 [%rd1239+4], %f116; add.u64 %rd1087, %SPL, 96; ld.local.u64 %rd1088, [%rd1087]; st.global.u32 [%rd1242], %rd1088; shr.u64 %rd1091, %rd1088, 32; st.global.u32 [%rd1242+4], %rd1091; st.global.f32 [%rd1245], %f4; st.global.f32 [%rd1245+4], %f5; st.global.f32 [%rd1245+8], %f6; st.global.f32 [%rd1245+12], %f5546; st.global.f32 [%rd1245+16], %f5531; st.global.f32 [%rd1245+20], %f5530; st.global.f32 [%rd1245+24], %f5635; st.global.f32 [%rd1245+28], %f5532; st.global.u32 [%rd1249], %r10; st.global.u32 [%rd1249+4], %r1794; $L__BB0_628: shr.u64 %rd1260, %rd11, 16; xor.b64 %rd1259, %rd1260, %rd11; mul.lo.s64 %rd1258, %rd1259, 2246822507; shr.u64 %rd1257, %rd1258, 13; xor.b64 %rd1256, %rd1257, %rd1258; mul.lo.s64 %rd1255, %rd1256, 3266489909; shr.u64 %rd1254, %rd1255, 16; xor.b64 %rd1253, %rd1254, %rd1255; ld.param.u32 %r1721, [g2p2g_param_11+40]; bar.sync 0; cvt.u64.u32 %rd1097, %r1721; add.s64 %rd282, %rd1097, -1; and.b64 %rd1385, %rd1253, %rd282; shl.b64 %rd1098, %rd1385, 4; add.s64 %rd1099, %rd5, %rd1098; ld.global.u64 %rd284, [%rd1099]; setp.eq.s64 %p824, %rd284, %rd11; @%p824 bra $L__BB0_634; setp.eq.s64 %p825, %rd284, -1; @%p825 bra $L__BB0_633; $L__BB0_631: 
add.s64 %rd1100, %rd1385, 1; and.b64 %rd1385, %rd1100, %rd282; shl.b64 %rd1101, %rd1385, 4; add.s64 %rd1102, %rd5, %rd1101; ld.global.u64 %rd287, [%rd1102]; setp.eq.s64 %p826, %rd287, %rd11; @%p826 bra $L__BB0_634; setp.ne.s64 %p827, %rd287, -1; @%p827 bra $L__BB0_631; $L__BB0_633: trap; $L__BB0_634: cvt.u64.u32 %rd1275, %r3; mov.u32 %r1722, %ntid.x; cvt.u64.u32 %rd1103, %r1; mul.lo.s64 %rd1105, %rd1275, %rd1103; and.b64 %rd289, %rd1105, 15; add.s64 %rd290, %rd289, %rd1275; setp.gt.u32 %p828, %r1722, 64; @%p828 bra $L__BB0_651; mul.wide.u32 %rd1261, %r3, %r1; shl.b64 %rd1106, %rd1385, 4; add.s64 %rd1107, %rd5, %rd1106; shr.u64 %rd1109, %rd1261, 2; and.b64 %rd293, %rd1109, 4; shr.u64 %rd1110, %rd1261, 3; and.b64 %rd294, %rd1110, 4; ld.global.u32 %r1695, [%rd1107+8]; mul.wide.u32 %rd295, %r1695, 16; add.s64 %rd1111, %rd289, 1; max.u64 %rd296, %rd1111, %rd290; sub.s64 %rd1112, %rd296, %rd1261; and.b64 %rd1388, %rd1112, 3; setp.eq.s64 %p829, %rd1388, 0; mov.u64 %rd1394, %rd289; @%p829 bra $L__BB0_640; mov.u64 %rd1387, %rd289; $L__BB0_637: .pragma "nounroll"; add.s64 %rd1394, %rd1387, 1; bfe.u64 %rd1113, %rd1387, 2, 2; and.b64 %rd1114, %rd1387, 3; or.b64 %rd1115, %rd1114, %rd293; or.b64 %rd1116, %rd1113, %rd294; shl.b64 %rd1117, %rd1116, 3; or.b64 %rd301, %rd1115, %rd1117; or.b64 %rd1118, %rd1114, %rd295; and.b64 %rd1119, %rd1387, 12; or.b64 %rd302, %rd1118, %rd1119; setp.le.u64 %p830, %rd355, %rd302; @%p830 bra $L__BB0_639; mul.lo.s64 %rd1130, %rd302, 48; add.s64 %rd1121, %rd349, %rd1130; shl.b64 %rd1131, %rd301, 6; mov.u64 %rd1132, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; add.s64 %rd1133, %rd1132, %rd1131; ld.shared.u32 %r1696, [%rd1133+20]; // begin inline asm cvta.to.global.u64 %rd1120, %rd1121;red.global.add.f32 [%rd1120], %r1696; // end inline asm add.s64 %rd1123, %rd1121, 4; ld.shared.u64 %rd1134, [%rd1133+24]; cvt.u32.u64 %r1697, %rd1134; shr.u64 %rd1135, %rd1134, 32; cvt.u32.u64 %r1698, %rd1135; // begin inline 
asm cvta.to.global.u64 %rd1122, %rd1123;red.global.add.f32 [%rd1122], %r1697; // end inline asm add.s64 %rd1125, %rd1121, 8; // begin inline asm cvta.to.global.u64 %rd1124, %rd1125;red.global.add.f32 [%rd1124], %r1698; // end inline asm add.s64 %rd1127, %rd1121, 12; ld.shared.u32 %r1699, [%rd1133+44]; // begin inline asm cvta.to.global.u64 %rd1126, %rd1127;red.global.add.f32 [%rd1126], %r1699; // end inline asm add.s64 %rd1129, %rd1121, 16; ld.shared.u32 %r1700, [%rd1133+40]; // begin inline asm cvta.to.global.u64 %rd1128, %rd1129;red.global.add.f32 [%rd1128], %r1700; // end inline asm $L__BB0_639: add.s64 %rd1388, %rd1388, -1; setp.ne.s64 %p831, %rd1388, 0; mov.u64 %rd1387, %rd1394; @%p831 bra $L__BB0_637; $L__BB0_640: not.b64 %rd1136, %rd289; add.s64 %rd1137, %rd296, %rd1136; setp.lt.u64 %p832, %rd1137, 3; @%p832 bra $L__BB0_651; add.s64 %rd1138, %rd1394, 3; and.b64 %rd1139, %rd1138, 3; and.b64 %rd1140, %rd1394, 3; xor.b64 %rd1141, %rd1140, 2; add.s64 %rd1142, %rd1394, 1; and.b64 %rd1143, %rd1142, 3; or.b64 %rd305, %rd1140, %rd293; or.b64 %rd306, %rd1140, %rd295; or.b64 %rd307, %rd1143, %rd293; or.b64 %rd308, %rd1143, %rd295; or.b64 %rd309, %rd1141, %rd293; or.b64 %rd310, %rd1141, %rd295; or.b64 %rd311, %rd1139, %rd293; or.b64 %rd312, %rd1139, %rd295; shr.u64 %rd1393, %rd1138, 2; add.s64 %rd1144, %rd1394, 2; shr.u64 %rd1392, %rd1144, 2; shr.u64 %rd1391, %rd1394, 2; shr.u64 %rd1390, %rd1142, 2; $L__BB0_642: and.b64 %rd322, %rd1391, 3; shl.b64 %rd1145, %rd1391, 2; and.b64 %rd1146, %rd1145, 12; or.b64 %rd323, %rd306, %rd1146; setp.le.u64 %p833, %rd355, %rd323; @%p833 bra $L__BB0_644; mul.lo.s64 %rd1157, %rd323, 48; add.s64 %rd1148, %rd349, %rd1157; or.b64 %rd1158, %rd322, %rd294; shl.b64 %rd1159, %rd1158, 3; or.b64 %rd1160, %rd305, %rd1159; shl.b64 %rd1161, %rd1160, 6; mov.u64 %rd1162, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; add.s64 %rd1163, %rd1162, %rd1161; ld.shared.u32 %r1701, [%rd1163+20]; // begin inline asm 
cvta.to.global.u64 %rd1147, %rd1148;red.global.add.f32 [%rd1147], %r1701; // end inline asm add.s64 %rd1150, %rd1148, 4; ld.shared.u64 %rd1164, [%rd1163+24]; cvt.u32.u64 %r1702, %rd1164; shr.u64 %rd1165, %rd1164, 32; cvt.u32.u64 %r1703, %rd1165; // begin inline asm cvta.to.global.u64 %rd1149, %rd1150;red.global.add.f32 [%rd1149], %r1702; // end inline asm add.s64 %rd1152, %rd1148, 8; // begin inline asm cvta.to.global.u64 %rd1151, %rd1152;red.global.add.f32 [%rd1151], %r1703; // end inline asm add.s64 %rd1154, %rd1148, 12; ld.shared.u32 %r1704, [%rd1163+44]; // begin inline asm cvta.to.global.u64 %rd1153, %rd1154;red.global.add.f32 [%rd1153], %r1704; // end inline asm add.s64 %rd1156, %rd1148, 16; ld.shared.u32 %r1705, [%rd1163+40]; // begin inline asm cvta.to.global.u64 %rd1155, %rd1156;red.global.add.f32 [%rd1155], %r1705; // end inline asm $L__BB0_644: and.b64 %rd324, %rd1390, 3; shl.b64 %rd1166, %rd1390, 2; and.b64 %rd1167, %rd1166, 12; or.b64 %rd325, %rd308, %rd1167; setp.le.u64 %p834, %rd355, %rd325; @%p834 bra $L__BB0_646; mul.lo.s64 %rd1178, %rd325, 48; add.s64 %rd1169, %rd349, %rd1178; or.b64 %rd1179, %rd324, %rd294; shl.b64 %rd1180, %rd1179, 3; or.b64 %rd1181, %rd307, %rd1180; shl.b64 %rd1182, %rd1181, 6; mov.u64 %rd1183, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; add.s64 %rd1184, %rd1183, %rd1182; ld.shared.u32 %r1706, [%rd1184+20]; // begin inline asm cvta.to.global.u64 %rd1168, %rd1169;red.global.add.f32 [%rd1168], %r1706; // end inline asm add.s64 %rd1171, %rd1169, 4; ld.shared.u64 %rd1185, [%rd1184+24]; cvt.u32.u64 %r1707, %rd1185; shr.u64 %rd1186, %rd1185, 32; cvt.u32.u64 %r1708, %rd1186; // begin inline asm cvta.to.global.u64 %rd1170, %rd1171;red.global.add.f32 [%rd1170], %r1707; // end inline asm add.s64 %rd1173, %rd1169, 8; // begin inline asm cvta.to.global.u64 %rd1172, %rd1173;red.global.add.f32 [%rd1172], %r1708; // end inline asm add.s64 %rd1175, %rd1169, 12; ld.shared.u32 %r1709, [%rd1184+44]; 
// begin inline asm cvta.to.global.u64 %rd1174, %rd1175;red.global.add.f32 [%rd1174], %r1709; // end inline asm add.s64 %rd1177, %rd1169, 16; ld.shared.u32 %r1710, [%rd1184+40]; // begin inline asm cvta.to.global.u64 %rd1176, %rd1177;red.global.add.f32 [%rd1176], %r1710; // end inline asm $L__BB0_646: and.b64 %rd326, %rd1392, 3; shl.b64 %rd1187, %rd1392, 2; and.b64 %rd1188, %rd1187, 12; or.b64 %rd327, %rd310, %rd1188; setp.le.u64 %p835, %rd355, %rd327; @%p835 bra $L__BB0_648; mul.lo.s64 %rd1199, %rd327, 48; add.s64 %rd1190, %rd349, %rd1199; or.b64 %rd1200, %rd326, %rd294; shl.b64 %rd1201, %rd1200, 3; or.b64 %rd1202, %rd309, %rd1201; shl.b64 %rd1203, %rd1202, 6; mov.u64 %rd1204, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; add.s64 %rd1205, %rd1204, %rd1203; ld.shared.u32 %r1711, [%rd1205+20]; // begin inline asm cvta.to.global.u64 %rd1189, %rd1190;red.global.add.f32 [%rd1189], %r1711; // end inline asm add.s64 %rd1192, %rd1190, 4; ld.shared.u64 %rd1206, [%rd1205+24]; cvt.u32.u64 %r1712, %rd1206; shr.u64 %rd1207, %rd1206, 32; cvt.u32.u64 %r1713, %rd1207; // begin inline asm cvta.to.global.u64 %rd1191, %rd1192;red.global.add.f32 [%rd1191], %r1712; // end inline asm add.s64 %rd1194, %rd1190, 8; // begin inline asm cvta.to.global.u64 %rd1193, %rd1194;red.global.add.f32 [%rd1193], %r1713; // end inline asm add.s64 %rd1196, %rd1190, 12; ld.shared.u32 %r1714, [%rd1205+44]; // begin inline asm cvta.to.global.u64 %rd1195, %rd1196;red.global.add.f32 [%rd1195], %r1714; // end inline asm add.s64 %rd1198, %rd1190, 16; ld.shared.u32 %r1715, [%rd1205+40]; // begin inline asm cvta.to.global.u64 %rd1197, %rd1198;red.global.add.f32 [%rd1197], %r1715; // end inline asm $L__BB0_648: add.s64 %rd1394, %rd1394, 4; and.b64 %rd329, %rd1393, 3; shl.b64 %rd1208, %rd1393, 2; and.b64 %rd1209, %rd1208, 12; or.b64 %rd330, %rd312, %rd1209; setp.le.u64 %p836, %rd355, %rd330; @%p836 bra $L__BB0_650; mul.lo.s64 %rd1220, %rd330, 48; add.s64 %rd1211, %rd349, 
%rd1220; or.b64 %rd1221, %rd329, %rd294; shl.b64 %rd1222, %rd1221, 3; or.b64 %rd1223, %rd311, %rd1222; shl.b64 %rd1224, %rd1223, 6; mov.u64 %rd1225, _ZN16sparkl2d_kernels4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h86f513edd992d572E; add.s64 %rd1226, %rd1225, %rd1224; ld.shared.u32 %r1716, [%rd1226+20]; // begin inline asm cvta.to.global.u64 %rd1210, %rd1211;red.global.add.f32 [%rd1210], %r1716; // end inline asm add.s64 %rd1213, %rd1211, 4; ld.shared.u64 %rd1227, [%rd1226+24]; cvt.u32.u64 %r1717, %rd1227; shr.u64 %rd1228, %rd1227, 32; cvt.u32.u64 %r1718, %rd1228; // begin inline asm cvta.to.global.u64 %rd1212, %rd1213;red.global.add.f32 [%rd1212], %r1717; // end inline asm add.s64 %rd1215, %rd1211, 8; // begin inline asm cvta.to.global.u64 %rd1214, %rd1215;red.global.add.f32 [%rd1214], %r1718; // end inline asm add.s64 %rd1217, %rd1211, 12; ld.shared.u32 %r1719, [%rd1226+44]; // begin inline asm cvta.to.global.u64 %rd1216, %rd1217;red.global.add.f32 [%rd1216], %r1719; // end inline asm add.s64 %rd1219, %rd1211, 16; ld.shared.u32 %r1720, [%rd1226+40]; // begin inline asm cvta.to.global.u64 %rd1218, %rd1219;red.global.add.f32 [%rd1218], %r1720; // end inline asm $L__BB0_650: add.s64 %rd1393, %rd1393, 1; add.s64 %rd1392, %rd1392, 1; add.s64 %rd1391, %rd1391, 1; add.s64 %rd1390, %rd1390, 1; setp.lt.u64 %p837, %rd1394, %rd290; @%p837 bra $L__BB0_642; $L__BB0_651: ret; $L__BB0_245: abs.f32 %f5210, %f255; setp.neu.f32 %p350, %f5210, 0f7F800000; @%p350 bra $L__BB0_248; selp.f32 %f5506, 0fFF800000, 0f7F800000, %p8; $L__BB0_248: selp.f32 %f2520, 0f3F800000, %f5506, %p324; div.rn.f32 %f2521, %f2520, %f253; mul.f32 %f2522, %f297, %f2521; div.rn.f32 %f2523, %f270, %f281; div.rn.f32 %f2524, %f271, %f281; fma.rn.f32 %f2525, %f2523, %f2522, %f269; fma.rn.f32 %f2526, %f2524, %f2522, %f269; sqrt.rn.f32 %f2527, %f2525; sqrt.rn.f32 %f2528, %f2526; mov.b64 {%r876, %r877}, %rd145; mov.b64 {%r878, %r879}, %rd144; mov.b32 %f2529, %r878; mul.f32 %f2530, %f2529, %f2527; mov.b32 
%f2531, %r879; mul.f32 %f2532, %f2531, %f2527; mov.b32 %f2533, %r876; mul.f32 %f2534, %f2533, %f2528; mov.b32 %f2535, %r877; mul.f32 %f2536, %f2535, %f2528; mov.b64 {%r880, %r881}, %rd147; mov.b64 {%r882, %r883}, %rd146; mov.b32 %f2537, %r882; mov.b32 %f2538, %r883; mul.f32 %f2539, %f2538, %f2534; mul.f32 %f2540, %f2538, %f2536; mov.b32 %f2541, %r880; mov.b32 %f2542, %r881; mul.f32 %f2543, %f2542, %f2534; mul.f32 %f2544, %f2542, %f2536; fma.rn.f32 %f2545, %f2537, %f2532, %f2540; mov.b32 %r884, %f2545; fma.rn.f32 %f2546, %f2537, %f2530, %f2539; mov.b32 %r885, %f2546; fma.rn.f32 %f2547, %f2541, %f2532, %f2544; mov.b32 %r886, %f2547; fma.rn.f32 %f2548, %f2541, %f2530, %f2543; mov.b32 %r887, %f2548; mov.b64 %rd1344, {%r887, %r886}; mov.b64 %rd1343, {%r885, %r884}; $L__BB0_283: add.u64 %rd1290, %SPL, 80; mov.b64 {%r948, %r949}, %rd1344; mov.b64 {%r950, %r951}, %rd1343; mov.b32 %f5531, %r951; mov.b32 %f5530, %r948; st.local.v2.u64 [%rd1290], {%rd1343, %rd1344}; st.f32 [%rd131], %f5517; bra.uni $L__BB0_341; $L__BB0_653: trap; $L__BB0_652: trap; $L__BB0_189: trap; $L__BB0_465: trap; } // .globl grid_update .visible .entry grid_update( .param .f32 grid_update_param_0, .param .align 8 .b8 grid_update_param_1[72], .param .u64 grid_update_param_2, .param .u64 grid_update_param_3, .param .align 4 .b8 grid_update_param_4[8] ) { .local .align 16 .b8 __local_depot1[736]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<349>; .reg .b16 %rs<120>; .reg .f32 %f<719>; .reg .b32 %r<660>; .reg .b64 %rd<1557>; mov.u64 %SPL, __local_depot1; cvta.local.u64 %SP, %SPL; ld.param.f32 %f187, [grid_update_param_0]; ld.param.u64 %rd61, [grid_update_param_2]; ld.param.u64 %rd58, [grid_update_param_3]; ld.param.f32 %f190, [grid_update_param_4+4]; ld.param.f32 %f189, [grid_update_param_4]; ld.param.u64 %rd614, [grid_update_param_1+64]; ld.param.u64 %rd609, [grid_update_param_1+16]; ld.param.u64 %rd608, [grid_update_param_1+8]; ld.param.f32 %f188, [grid_update_param_1]; cvta.to.global.u64 %rd1365, %rd61; 
add.u64 %rd2, %SPL, 16; add.u64 %rd3, %SPL, 16; add.u64 %rd4, %SPL, 208; add.u64 %rd7, %SPL, 0; add.u64 %rd8, %SPL, 0; add.u64 %rd9, %SPL, 0; add.u64 %rd10, %SPL, 0; add.u64 %rd11, %SPL, 0; add.u64 %rd12, %SPL, 0; add.u64 %rd13, %SPL, 16; add.u64 %rd14, %SPL, 160; add.u64 %rd15, %SPL, 176; add.u64 %rd16, %SPL, 208; add.u64 %rd17, %SPL, 728; mov.u32 %r190, %tid.y; mov.u32 %r191, %tid.x; mov.u32 %r192, %ctaid.x; cvt.u64.u32 %rd19, %r192; mul.wide.u32 %rd633, %r192, 16; cvt.u64.u32 %rd20, %r191; add.s64 %rd634, %rd20, %rd633; cvt.u64.u32 %rd21, %r190; mul.wide.u32 %rd635, %r190, 4; add.s64 %rd22, %rd634, %rd635; setp.le.u64 %p13, %rd614, %rd22; @%p13 bra $L__BB1_257; cvta.to.global.u64 %rd636, %rd609; mul.lo.s64 %rd637, %rd19, 24; add.s64 %rd638, %rd636, %rd637; ld.global.u64 %rd639, [%rd638]; cvta.to.global.u64 %rd640, %rd608; shr.u64 %rd641, %rd639, 30; and.b64 %rd642, %rd641, 17179869180; add.s64 %rd643, %rd21, %rd642; add.s64 %rd644, %rd643, -8589934592; shl.b64 %rd645, %rd639, 2; and.b64 %rd646, %rd645, 17179869180; add.s64 %rd647, %rd20, %rd646; add.s64 %rd648, %rd647, -8589934592; mul.lo.s64 %rd649, %rd22, 48; cvt.rn.f32.s64 %f191, %rd648; cvt.rn.f32.s64 %f192, %rd644; mul.f32 %f2, %f188, %f191; mul.f32 %f3, %f188, %f192; add.s64 %rd650, %rd640, %rd649; add.s64 %rd23, %rd650, 4; ld.global.f32 %f193, [%rd650]; mul.f32 %f194, %f189, %f193; mul.f32 %f195, %f190, %f193; ld.global.u32 %rd651, [%rd650+4]; ld.global.u32 %rd652, [%rd650+8]; bfi.b64 %rd653, %rd652, %rd651, 32, 32; cvt.u32.u64 %r193, %rd653; shr.u64 %rd654, %rd653, 32; cvt.u32.u64 %r194, %rd654; mov.b32 %f196, %r193; fma.rn.f32 %f197, %f194, %f187, %f196; mov.b32 %f198, %r194; fma.rn.f32 %f199, %f195, %f187, %f198; setp.eq.f32 %p14, %f193, 0f00000000; rcp.rn.f32 %f200, %f193; selp.f32 %f201, 0f00000000, %f200, %p14; mul.f32 %f4, %f201, %f197; mul.f32 %f5, %f201, %f199; ld.global.u64 %rd1553, [%rd650+24]; setp.ne.s64 %p15, %rd1553, 0; @%p15 bra $L__BB1_240; add.f32 %f6, %f188, %f188; add.s64 %rd26, %rd7, 
8; add.s64 %rd27, %rd8, 8; add.s64 %rd28, %rd9, 8; add.s64 %rd29, %rd10, 8; add.s64 %rd30, %rd11, 8; add.s64 %rd31, %rd12, 8; add.s64 %rd32, %rd17, 8; mov.u64 %rd1368, 0; add.s64 %rd34, %rd4, 4; add.s64 %rd37, %rd4, 44; add.s64 %rd39, %rd2, 8; mov.u16 %rs1, 2; mov.pred %p287, 0; mov.pred %p272, -1; bra.uni $L__BB1_3; $L__BB1_237: add.s64 %rd1368, %rd57, 1; add.s64 %rd1365, %rd54, 280; and.b16 %rs1, %rs23, 1; mov.b32 %r522, %f176; cvt.u64.u32 %rd1237, %r522; mov.b32 %r523, %f175; cvt.u64.u32 %rd1238, %r523; bfi.b64 %rd49, %rd1237, %rd1238, 32, 32; mov.u64 %rd48, %rd57; $L__BB1_3: and.b16 %rs26, %rs1, 255; setp.eq.s16 %p16, %rs26, 2; selp.f32 %f8, 0f7F7FFFFF, %f699, %p16; $L__BB1_5: mov.u64 %rd57, %rd1368; mov.u64 %rd54, %rd1365; setp.eq.s64 %p17, %rd58, 0; @%p17 bra $L__BB1_238; add.s64 %rd58, %rd58, -1; setp.eq.s64 %p18, %rd61, 0; @%p18 bra $L__BB1_238; add.s64 %rd1368, %rd57, 1; add.s64 %rd1365, %rd54, 280; add.s64 %rd61, %rd61, 280; ld.global.u32 %r195, [%rd54+272]; setp.eq.s32 %p19, %r195, 3; @%p19 bra $L__BB1_5; ld.global.u16 %rs27, [%rd54]; setp.eq.s16 %p20, %rs27, 1; @%p20 bra $L__BB1_179; setp.eq.s16 %p21, %rs27, 2; @%p21 bra $L__BB1_68; setp.ne.s16 %p22, %rs27, 3; @%p22 bra $L__BB1_217; ld.global.u8 %rs2, [%rd54+24]; ld.global.f32 %f9, [%rd54+256]; sub.f32 %f203, %f2, %f9; ld.global.f32 %f10, [%rd54+260]; sub.f32 %f204, %f3, %f10; ld.global.f32 %f205, [%rd54+252]; ld.global.f32 %f11, [%rd54+248]; mul.f32 %f206, %f204, %f205; fma.rn.f32 %f12, %f203, %f11, %f206; mul.f32 %f207, %f203, %f205; mul.f32 %f208, %f204, %f11; sub.f32 %f13, %f208, %f207; mov.u32 %r23, 2; st.local.u32 [%rd4+20], %r23; ld.global.u64 %rd64, [%rd54+16]; setp.eq.s64 %p23, %rd64, 0; @%p23 bra $L__BB1_65; ld.global.u64 %rd65, [%rd54+8]; mov.u64 %rd1374, 1; bra.uni $L__BB1_13; $L__BB1_21: sub.f32 %f220, %f701, %f12; abs.f32 %f30, %f220; setp.le.f32 %p33, %f30, 0f34000000; @%p33 bra $L__BB1_23; abs.f32 %f221, %f701; abs.f32 %f222, %f12; setp.gt.f32 %p35, %f222, %f221; selp.f32 %f223, %f222, 
%f221, %p35; mul.f32 %f224, %f223, 0f34000000; setp.gtu.f32 %p36, %f30, %f224; @%p36 bra $L__BB1_27; bra.uni $L__BB1_23; $L__BB1_13: shl.b64 %rd662, %rd1374, 3; add.s64 %rd663, %rd65, %rd662; setp.eq.s64 %p24, %rd1374, %rd64; selp.b64 %rd664, 0, %rd1374, %p24; shl.b64 %rd665, %rd664, 3; add.s64 %rd666, %rd65, %rd665; ld.u32 %rd667, [%rd663+-8]; ld.u32 %rd668, [%rd663+-4]; bfi.b64 %rd68, %rd668, %rd667, 32, 32; ld.u32 %rd669, [%rd666]; ld.u32 %rd670, [%rd666+4]; bfi.b64 %rd69, %rd670, %rd669, 32, 32; cvt.u32.u64 %r6, %rd68; mov.b32 %f17, %r6; shr.u64 %rd671, %rd68, 32; cvt.u32.u64 %r216, %rd671; mov.b32 %f18, %r216; cvt.u32.u64 %r7, %rd69; shr.u64 %rd672, %rd69, 32; cvt.u32.u64 %r217, %rd672; mov.b32 %f19, %r7; sub.f32 %f20, %f19, %f17; mov.b32 %f210, %r217; sub.f32 %f21, %f210, %f18; sub.f32 %f211, %f12, %f17; sub.f32 %f212, %f13, %f18; mul.f32 %f213, %f21, %f212; fma.rn.f32 %f22, %f20, %f211, %f213; mul.f32 %f214, %f21, %f21; fma.rn.f32 %f215, %f20, %f20, %f214; add.f32 %f23, %f215, 0f00000000; setp.gtu.f32 %p25, %f22, 0f00000000; mov.b64 {%r218, %r588}, %rd68; mov.b64 {%r219, %r9}, %rd69; @%p25 bra $L__BB1_15; bra.uni $L__BB1_14; $L__BB1_15: setp.ltu.f32 %p26, %f22, %f23; @%p26 bra $L__BB1_17; bra.uni $L__BB1_16; $L__BB1_17: setp.eq.f32 %p27, %f23, 0f00000000; @%p27 bra $L__BB1_64; mov.b32 %f677, %r216; shr.u64 %rd1357, %rd69, 32; cvt.u32.u64 %r579, %rd1357; shr.u64 %rd1356, %rd68, 32; cvt.u32.u64 %r578, %rd1356; mov.b32 %f676, %r578; mov.b32 %f675, %r579; sub.f32 %f674, %f675, %f676; mov.b32 %f673, %r6; cvt.u32.u64 %r577, %rd69; cvt.u32.u64 %r576, %rd68; mov.b32 %f672, %r576; mov.b32 %f671, %r577; sub.f32 %f670, %f671, %f672; div.rn.f32 %f216, %f22, %f23; mov.f32 %f217, 0f3F800000; sub.f32 %f218, %f217, %f216; mov.b32 %r590, %f218; mov.b32 %r591, %f216; fma.rn.f32 %f701, %f670, %f216, %f672; mov.b32 %r587, %f701; fma.rn.f32 %f702, %f674, %f216, %f676; mov.b32 %r588, %f702; mov.u32 %r589, 1; bra.uni $L__BB1_19; $L__BB1_14: cvt.u32.u64 %r587, %rd68; mov.b32 %f701, 
%r587; mov.b32 %f702, %r588; mov.u32 %r589, 0; mov.u32 %r590, %r589; bra.uni $L__BB1_19; $L__BB1_16: cvt.u32.u64 %r575, %rd69; cvt.u32.u64 %r587, %rd69; mov.b32 %f701, %r587; mov.b32 %f702, %r9; mov.u32 %r590, 1; mov.u32 %r589, 0; mov.u32 %r588, %r9; $L__BB1_19: setp.eq.f32 %p28, %f12, %f701; @%p28 bra $L__BB1_23; bra.uni $L__BB1_20; $L__BB1_23: setp.eq.f32 %p38, %f702, %f13; mov.pred %p37, -1; mov.pred %p343, %p37; @%p38 bra $L__BB1_27; mov.b32 %r573, %f13; and.b32 %r572, %r573, 2147483647; mov.b32 %f668, %r572; setp.eq.f32 %p40, %f668, 0f7F800000; and.b32 %r228, %r588, 2147483647; mov.b32 %f225, %r228; setp.eq.f32 %p41, %f225, 0f7F800000; or.pred %p42, %p40, %p41; mov.pred %p343, 0; @%p42 bra $L__BB1_27; sub.f32 %f226, %f702, %f13; abs.f32 %f31, %f226; setp.le.f32 %p44, %f31, 0f34000000; mov.pred %p343, %p37; @%p44 bra $L__BB1_27; abs.f32 %f227, %f702; abs.f32 %f228, %f13; setp.gt.f32 %p45, %f228, %f227; selp.f32 %f229, %f228, %f227, %p45; mul.f32 %f230, %f229, 0f34000000; setp.le.f32 %p343, %f31, %f230; bra.uni $L__BB1_27; $L__BB1_20: mov.b32 %r571, %f12; and.b32 %r570, %r571, 2147483647; mov.b32 %f667, %r570; setp.eq.f32 %p30, %f667, 0f7F800000; and.b32 %r227, %r587, 2147483647; mov.b32 %f219, %r227; setp.eq.f32 %p31, %f219, 0f7F800000; or.pred %p32, %p30, %p31; mov.pred %p343, 0; @%p32 bra $L__BB1_27; bra.uni $L__BB1_21; $L__BB1_27: cvt.u64.u32 %rd673, %r588; cvt.u64.u32 %rd674, %r587; bfi.b64 %rd70, %rd673, %rd674, 32, 32; mov.b64 {%r229, %r230}, %rd70; selp.u64 %rd71, 1, 0, %p343; mov.b32 %f33, %r230; mov.b32 %f32, %r229; sub.f32 %f231, %f32, %f12; sub.f32 %f232, %f33, %f13; mul.f32 %f233, %f232, %f232; fma.rn.f32 %f234, %f231, %f231, %f233; add.f32 %f235, %f234, 0f00000000; sqrt.rn.f32 %f35, %f235; setp.geu.f32 %p46, %f35, %f703; setp.ne.s32 %p47, %r23, 2; and.pred %p48, %p47, %p46; @%p48 bra $L__BB1_29; shr.u64 %rd1355, %rd69, 32; shr.u64 %rd1354, %rd68, 32; add.s64 %rd1375, %rd1374, -1; st.local.u64 [%rd4], %rd1375; st.local.v2.f32 [%rd4+8], {%f32, %f33}; 
mov.b64 {%r233, %r234}, %rd71; st.local.v2.u32 [%rd4+16], {%r233, %r589}; st.local.v2.u32 [%rd4+24], {%r590, %r591}; st.local.f32 [%rd4+32], %f35; st.local.u32 [%rd4+36], %rd68; st.local.u32 [%rd4+44], %rd69; st.local.u32 [%rd4+40], %rd1354; st.local.u32 [%rd4+48], %rd1355; mov.f32 %f703, %f35; mov.u32 %r23, %r589; $L__BB1_29: add.s64 %rd74, %rd1374, 1; setp.lt.u64 %p49, %rd1374, %rd64; mov.u64 %rd1374, %rd74; @%p49 bra $L__BB1_13; ld.local.u32 %rd681, [%rd4+36]; ld.local.u32 %rd682, [%rd4+40]; bfi.b64 %rd683, %rd682, %rd681, 32, 32; mov.u64 %rd680, 0; cvt.u32.u64 %r235, %rd683; mov.b32 %f236, %r235; shr.u64 %rd684, %rd683, 32; cvt.u32.u64 %r236, %rd684; mov.b32 %f237, %r236; ld.local.u32 %rd685, [%rd4+44]; ld.local.u32 %rd686, [%rd4+48]; bfi.b64 %rd687, %rd686, %rd685, 32, 32; cvt.u32.u64 %r237, %rd687; shr.u64 %rd688, %rd687, 32; cvt.u32.u64 %r238, %rd688; mov.b32 %f238, %r237; sub.f32 %f37, %f238, %f236; mov.b32 %f239, %r238; sub.f32 %f38, %f239, %f237; mul.f32 %f240, %f38, %f38; fma.rn.f32 %f241, %f37, %f37, %f240; add.f32 %f39, %f241, 0f00000000; setp.leu.f32 %p50, %f39, 0f28800000; mov.u64 %rd1376, %rd680; mov.u64 %rd1377, %rd680; mov.u64 %rd1378, %rd680; @%p50 bra $L__BB1_32; neg.f32 %f242, %f37; sqrt.rn.f32 %f243, %f39; div.rn.f32 %f244, %f38, %f243; div.rn.f32 %f245, %f242, %f243; mov.b32 %r239, %f245; mov.b32 %r240, %f244; mov.u64 %rd1378, 1; mov.b64 %rd691, {%r240, %r239}; shr.u64 %rd1377, %rd691, 32; shl.b64 %rd1376, %rd691, 32; $L__BB1_32: or.b64 %rd81, %rd1378, %rd1376; or.b64 %rd82, %rd680, %rd1377; and.b64 %rd692, %rd680, 4294967295; xor.b64 %rd693, %rd1378, 1; or.b64 %rd694, %rd693, %rd692; setp.ne.s64 %p51, %rd694, 0; @%p51 bra $L__BB1_63; mov.b64 {%r241, %r242}, %rd82; mov.b64 {%r243, %r244}, %rd81; mov.b32 %f40, %r244; mov.b32 %f41, %r241; setp.eq.s32 %p52, %r23, 1; @%p52 bra $L__BB1_61; bra.uni $L__BB1_34; $L__BB1_61: ld.local.u64 %rd771, [%rd4+8]; cvt.u32.u64 %r265, %rd771; mov.b32 %f273, %r265; shr.u64 %rd772, %rd771, 32; cvt.u32.u64 %r266, 
%rd772; mov.b32 %f274, %r266; sub.f32 %f275, %f2, %f273; sub.f32 %f276, %f3, %f274; mul.f32 %f277, %f41, %f276; fma.rn.f32 %f278, %f40, %f275, %f277; setp.le.f32 %p344, %f278, 0f00000000; bra.uni $L__BB1_62; $L__BB1_68: ld.global.f32 %f289, [%rd54+256]; mov.u64 %rd1532, 0; sub.f32 %f290, %f2, %f289; ld.global.f32 %f291, [%rd54+260]; sub.f32 %f292, %f3, %f291; ld.global.f32 %f293, [%rd54+252]; ld.global.f32 %f294, [%rd54+248]; mul.f32 %f295, %f292, %f293; fma.rn.f32 %f49, %f290, %f294, %f295; mul.f32 %f296, %f290, %f293; mul.f32 %f297, %f292, %f294; sub.f32 %f50, %f297, %f296; mov.b32 %r274, %f49; mov.b32 %r275, %f50; cvt.u64.u32 %rd793, %r275; cvt.u64.u32 %rd794, %r274; bfi.b64 %rd795, %rd793, %rd794, 32, 32; st.local.u64 [%rd17], %rd795; ld.global.u64 %rd184, [%rd54+32]; setp.eq.s64 %p73, %rd184, 0; mov.u64 %rd1533, 2; mov.u64 %rd1534, %rd1532; @%p73 bra $L__BB1_174; mov.u32 %r282, 0; st.local.u32 [%rd16], %r282; mov.u32 %r283, -16777217; st.local.u32 [%rd16+4], %r283; mov.u32 %r45, 1; st.local.u32 [%rd16+512], %r45; ld.global.u64 %rd186, [%rd54+24]; ld.global.u64 %rd187, [%rd54+80]; ld.global.u64 %rd188, [%rd54+72]; mov.u32 %r43, 2139095039; mov.u32 %r42, 4; bra.uni $L__BB1_71; $L__BB1_179: mov.u16 %rs114, 2; ld.global.f32 %f116, [%rd54+256]; sub.f32 %f493, %f2, %f116; ld.global.f32 %f117, [%rd54+260]; sub.f32 %f494, %f3, %f117; ld.global.f32 %f495, [%rd54+252]; ld.global.f32 %f118, [%rd54+248]; mul.f32 %f496, %f494, %f495; fma.rn.f32 %f119, %f493, %f118, %f496; mul.f32 %f497, %f493, %f495; mul.f32 %f498, %f494, %f118; sub.f32 %f120, %f498, %f497; mov.b32 %r146, %f119; mov.b32 %r147, %f120; ld.global.u64 %rd548, [%rd54+56]; ld.global.u64 %rd547, [%rd54+48]; sub.f32 %f499, %f119, %f6; sub.f32 %f500, %f120, %f6; mov.b32 %r454, %f499; mov.b32 %r455, %f500; cvt.u64.u32 %rd1164, %r455; cvt.u64.u32 %rd1165, %r454; add.f32 %f501, %f6, %f119; add.f32 %f502, %f6, %f120; mov.b32 %r456, %f501; mov.b32 %r457, %f502; cvt.u64.u32 %rd1166, %r457; cvt.u64.u32 %rd1167, %r456; 
bfi.b64 %rd1168, %rd1164, %rd1165, 32, 32; mov.b64 {%r458, %r459}, %rd1168; bfi.b64 %rd1169, %rd1166, %rd1167, 32, 32; mov.b64 {%r460, %r461}, %rd1169; st.local.u8 [%rd16+8], %rs114; mov.b32 %f124, %r461; mov.b32 %f122, %r459; mov.b32 %f123, %r460; mov.b32 %f121, %r458; ld.global.v2.f32 {%f503, %f504}, [%rd54+40]; div.rn.f32 %f127, %f121, %f503; div.rn.f32 %f128, %f123, %f503; ld.global.u64 %rd550, [%rd54+16]; cvt.rn.f32.u64 %f505, %rd550; add.f32 %f506, %f505, 0fBF800000; rcp.rn.f32 %f129, %f506; setp.lt.f32 %p240, %f128, 0fBF000000; setp.gt.f32 %p241, %f127, 0f3F000000; or.pred %p242, %p241, %p240; @%p242 bra $L__BB1_211; add.f32 %f507, %f127, 0f3F000000; div.rn.f32 %f508, %f507, %f129; cvt.rmi.f32.f32 %f509, %f508; add.s64 %rd1170, %rd550, -2; cvt.rn.f32.u64 %f510, %rd1170; setp.gt.f32 %p243, %f509, 0f00000000; setp.lt.f32 %p244, %f509, %f510; selp.f32 %f511, %f509, %f510, %p244; selp.f32 %f512, %f511, 0f00000000, %p243; setp.gt.f32 %p245, %f512, 0f5F7FFFFF; max.f32 %f513, %f512, 0f00000000; cvt.rzi.u64.f32 %rd1171, %f513; selp.b64 %rd556, -1, %rd1171, %p245; add.f32 %f514, %f128, 0f3F000000; div.rn.f32 %f515, %f514, %f129; cvt.rpi.f32.f32 %f516, %f515; add.s64 %rd1172, %rd550, -1; cvt.rn.f32.u64 %f517, %rd1172; setp.gt.f32 %p246, %f516, 0f00000000; setp.lt.f32 %p247, %f516, %f517; selp.f32 %f518, %f516, %f517, %p247; selp.f32 %f519, %f518, 0f00000000, %p246; setp.gt.f32 %p248, %f519, 0f5F7FFFFF; max.f32 %f520, %f519, 0f00000000; cvt.rzi.u64.f32 %rd1173, %f520; selp.b64 %rd552, -1, %rd1173, %p248; setp.ge.u64 %p249, %rd556, %rd552; @%p249 bra $L__BB1_211; div.rn.f32 %f130, %f122, %f504; div.rn.f32 %f131, %f124, %f504; ld.global.u64 %rd553, [%rd54+32]; ld.global.u64 %rd554, [%rd54+24]; ld.global.u64 %rd555, [%rd54+8]; ld.local.v4.u32 {%r652, %r653, %r654, %r467}, [%rd16]; mov.f32 %f715, 0f7F7FFFFF; bra.uni $L__BB1_182; $L__BB1_217: add.s64 %rd1541, %rd3, 8; add.u64 %rd1544, %SP, 16; ld.global.f32 %f158, [%rd54+256]; sub.f32 %f562, %f2, %f158; ld.global.f32 %f159, 
[%rd54+260]; sub.f32 %f563, %f3, %f159; ld.global.f32 %f160, [%rd54+252]; ld.global.f32 %f161, [%rd54+248]; mul.f32 %f564, %f563, %f160; fma.rn.f32 %f162, %f562, %f161, %f564; mul.f32 %f565, %f562, %f160; mul.f32 %f566, %f563, %f161; sub.f32 %f163, %f566, %f565; ld.global.u32 %rd1200, [%rd54+8]; ld.global.u32 %rd1201, [%rd54+12]; bfi.b64 %rd1202, %rd1201, %rd1200, 32, 32; cvt.u32.u64 %r506, %rd1202; mov.b32 %f567, %r506; shr.u64 %rd1203, %rd1202, 32; cvt.u32.u64 %r507, %rd1203; mov.b32 %f568, %r507; neg.f32 %f569, %f567; neg.f32 %f570, %f568; sub.f32 %f164, %f569, %f162; sub.f32 %f165, %f570, %f163; sub.f32 %f166, %f162, %f567; sub.f32 %f167, %f163, %f568; setp.ge.f32 %p298, %f164, 0f00000000; selp.f32 %f571, %f164, 0f00000000, %p298; setp.ge.f32 %p299, %f165, 0f00000000; selp.f32 %f572, %f165, 0f00000000, %p299; setp.ge.f32 %p300, %f166, 0f00000000; selp.f32 %f573, %f166, 0f00000000, %p300; setp.ge.f32 %p301, %f167, 0f00000000; selp.f32 %f574, %f167, 0f00000000, %p301; sub.f32 %f168, %f571, %f573; mov.b32 %r508, %f168; sub.f32 %f169, %f572, %f574; mov.b32 %r509, %f169; cvt.u64.u32 %rd1204, %r509; cvt.u64.u32 %rd1205, %r508; bfi.b64 %rd1206, %rd1204, %rd1205, 32, 32; st.local.u64 [%rd3], %rd1206; mov.u64 %rd1548, 2; mov.u64 %rd1542, %rd3; mov.u64 %rd1543, %rd3; mov.u64 %rd1545, %rd3; mov.u64 %rd1546, %rd3; mov.u64 %rd1547, %rd1544; $L__BB1_218: setp.eq.s64 %p302, %rd1548, 0; @%p302 bra $L__BB1_221; add.s64 %rd1548, %rd1548, -1; add.s64 %rd1207, %rd1545, 8; setp.eq.s64 %p303, %rd1545, %rd1541; selp.b64 %rd1541, %rd1207, %rd1541, %p303; add.s64 %rd1208, %rd1542, 8; selp.b64 %rd1542, %rd1208, %rd1542, %p303; add.s64 %rd1209, %rd1543, 8; selp.b64 %rd1543, %rd1209, %rd1543, %p303; add.s64 %rd1210, %rd1544, 8; selp.b64 %rd1544, %rd1210, %rd1544, %p303; selp.b64 %rd1211, %rd1208, %rd1545, %p303; selp.b64 %rd1212, %rd1209, %rd1546, %p303; selp.b64 %rd1213, %rd1210, %rd1547, %p303; setp.eq.s64 %p304, %rd1548, 0; add.s64 %rd1214, %rd1211, 4; add.s64 %rd1215, %rd1212, 4; 
add.s64 %rd1216, %rd1213, 4; selp.b64 %rd1545, %rd1211, %rd1214, %p304; selp.b64 %rd1546, %rd1212, %rd1215, %p304; selp.b64 %rd1547, %rd1213, %rd1216, %p304; ld.local.f32 %f575, [%rd1212]; setp.eq.f32 %p305, %f575, 0f00000000; @%p305 bra $L__BB1_218; add.f32 %f576, %f162, %f168; mov.b32 %r510, %f576; add.f32 %f577, %f163, %f169; mov.b32 %r511, %f577; cvt.u64.u32 %rd1219, %r511; cvt.u64.u32 %rd1220, %r510; bfi.b64 %rd1551, %rd1219, %rd1220, 32, 32; mov.u64 %rd1552, 0; bra.uni $L__BB1_234; $L__BB1_221: setp.lt.f32 %p306, %f164, %f166; mov.f32 %f716, 0fFF7FFFFF; @%p306 bra $L__BB1_224; bra.uni $L__BB1_222; $L__BB1_224: setp.leu.f32 %p311, %f166, 0fFF7FFFFF; mov.pred %p348, 0; @%p311 bra $L__BB1_226; mov.f32 %f716, %f166; bra.uni $L__BB1_226; $L__BB1_222: setp.leu.f32 %p308, %f164, 0fFF7FFFFF; mov.pred %p348, 0; @%p308 bra $L__BB1_226; mov.pred %p348, -1; mov.f32 %f716, %f164; $L__BB1_226: setp.lt.f32 %p313, %f165, %f167; @%p313 bra $L__BB1_229; bra.uni $L__BB1_227; $L__BB1_229: setp.gt.f32 %p315, %f167, %f716; @%p315 bra $L__BB1_232; bra.uni $L__BB1_230; $L__BB1_232: mov.u64 %rd1223, 0; st.local.u64 [%rd4], %rd1223; neg.f32 %f718, %f167; mov.u64 %rd1550, %rd34; bra.uni $L__BB1_233; $L__BB1_227: setp.leu.f32 %p314, %f165, %f716; @%p314 bra $L__BB1_230; mov.u64 %rd1221, 0; st.local.u64 [%rd4], %rd1221; mov.u64 %rd1550, %rd34; mov.f32 %f716, %f165; bra.uni $L__BB1_231; $L__BB1_230: mov.u64 %rd1222, 0; st.local.u64 [%rd4], %rd1222; neg.f32 %f718, %f716; not.pred %p316, %p348; mov.u64 %rd1550, %rd4; @%p316 bra $L__BB1_233; $L__BB1_231: mov.f32 %f718, %f716; $L__BB1_233: st.local.f32 [%rd1550], %f718; ld.local.u64 %rd1226, [%rd4]; cvt.u32.u64 %r512, %rd1226; mov.b32 %f580, %r512; shr.u64 %rd1227, %rd1226, 32; cvt.u32.u64 %r513, %rd1227; mov.b32 %f581, %r513; add.f32 %f582, %f162, %f580; add.f32 %f583, %f163, %f581; mov.b32 %r514, %f582; mov.b32 %r515, %f583; cvt.u64.u32 %rd1228, %r515; cvt.u64.u32 %rd1229, %r514; bfi.b64 %rd1551, %rd1228, %rd1229, 32, 32; mov.u64 %rd1552, 
1; $L__BB1_234: mov.u64 %rd1297, 0; cvt.u32.u64 %r516, %rd1551; mov.b32 %f584, %r516; shr.u64 %rd1230, %rd1551, 32; cvt.u32.u64 %r517, %rd1230; mov.b32 %f585, %r517; mul.f32 %f586, %f161, %f584; mul.f32 %f587, %f160, %f585; sub.f32 %f588, %f586, %f587; mul.f32 %f589, %f161, %f585; fma.rn.f32 %f590, %f160, %f584, %f589; add.f32 %f591, %f158, %f588; mov.b32 %r518, %f591; add.f32 %f592, %f159, %f590; mov.b32 %r519, %f592; cvt.u64.u32 %rd1231, %r519; cvt.u64.u32 %rd1232, %r518; bfi.b64 %rd1233, %rd1231, %rd1232, 32, 32; or.b64 %rd1234, %rd1297, %rd1233; mov.b64 {%r655, %r656}, %rd1234; mov.b64 {%r657, %r520}, %rd1552; bra.uni $L__BB1_235; $L__BB1_199: sub.f32 %f533, %f713, %f119; abs.f32 %f151, %f533; setp.le.f32 %p268, %f151, 0f34000000; @%p268 bra $L__BB1_201; abs.f32 %f534, %f713; abs.f32 %f535, %f119; setp.gt.f32 %p270, %f535, %f534; selp.f32 %f536, %f535, %f534, %p270; mul.f32 %f537, %f536, 0f34000000; setp.gtu.f32 %p271, %f151, %f537; mov.pred %p346, %p287; @%p271 bra $L__BB1_205; bra.uni $L__BB1_201; $L__BB1_182: setp.gt.u64 %p250, %rd553, %rd556; @%p250 bra $L__BB1_184; bra.uni $L__BB1_183; $L__BB1_184: add.s64 %rd1174, %rd554, %rd556; ld.u8 %rs94, [%rd1174]; setp.eq.s16 %p251, %rs94, 0; @%p251 bra $L__BB1_209; cvt.rn.f32.u64 %f522, %rd556; fma.rn.f32 %f135, %f129, %f522, 0fBF000000; setp.gt.u64 %p252, %rd550, %rd556; @%p252 bra $L__BB1_187; bra.uni $L__BB1_186; $L__BB1_187: shl.b64 %rd1175, %rd556, 2; add.s64 %rd557, %rd555, %rd1175; ld.f32 %f136, [%rd557]; add.s64 %rd1176, %rd556, 1; setp.gt.u64 %p253, %rd550, %rd1176; @%p253 bra $L__BB1_189; bra.uni $L__BB1_188; $L__BB1_189: ld.f32 %f137, [%rd557+4]; setp.gt.f32 %p254, %f137, %f131; setp.gt.f32 %p255, %f136, %f131; and.pred %p256, %p255, %p254; @%p256 bra $L__BB1_209; setp.lt.f32 %p257, %f136, %f130; setp.lt.f32 %p258, %f137, %f130; and.pred %p259, %p257, %p258; @%p259 bra $L__BB1_209; cvt.rn.f32.u64 %f693, %rd556; fma.rn.f32 %f692, %f129, %f693, 0fBF000000; mul.f32 %f523, %f503, %f692; mov.b32 %r468, %f523; 
mul.f32 %f140, %f504, %f136; mov.b32 %r469, %f140; cvt.u64.u32 %rd1177, %r469; cvt.u64.u32 %rd1178, %r468; add.f32 %f524, %f129, %f692; mul.f32 %f138, %f503, %f524; mov.b32 %r650, %f138; mul.f32 %f525, %f504, %f137; mov.b32 %r470, %f525; cvt.u64.u32 %rd1179, %r470; cvt.u64.u32 %rd1180, %r650; bfi.b64 %rd1181, %rd1179, %rd1180, 32, 32; bfi.b64 %rd1182, %rd1177, %rd1178, 32, 32; cvt.u32.u64 %r155, %rd1182; mov.b32 %f139, %r155; sub.f32 %f141, %f138, %f139; sub.f32 %f142, %f525, %f140; sub.f32 %f526, %f119, %f139; sub.f32 %f527, %f120, %f140; mul.f32 %f528, %f142, %f527; fma.rn.f32 %f143, %f141, %f526, %f528; mul.f32 %f529, %f142, %f142; fma.rn.f32 %f530, %f141, %f141, %f529; add.f32 %f144, %f530, 0f00000000; setp.gtu.f32 %p260, %f143, 0f00000000; mov.b64 {%r471, %r651}, %rd1182; mov.b64 {%r472, %r157}, %rd1181; @%p260 bra $L__BB1_193; bra.uni $L__BB1_192; $L__BB1_193: setp.ltu.f32 %p261, %f143, %f144; @%p261 bra $L__BB1_195; bra.uni $L__BB1_194; $L__BB1_195: setp.eq.f32 %p262, %f144, 0f00000000; @%p262 bra $L__BB1_208; cvt.u32.u64 %r585, %rd1182; mov.b32 %f698, %r585; div.rn.f32 %f531, %f143, %f144; fma.rn.f32 %f713, %f141, %f531, %f698; mov.b32 %r650, %f713; fma.rn.f32 %f714, %f142, %f531, %f140; mov.b32 %r651, %f714; bra.uni $L__BB1_197; $L__BB1_192: cvt.u32.u64 %r650, %rd1182; mov.b32 %f713, %r650; mov.b32 %f714, %r651; bra.uni $L__BB1_197; $L__BB1_194: cvt.rn.f32.u64 %f697, %rd556; add.f32 %f696, %f129, %f692; mul.f32 %f713, %f503, %f696; mov.b32 %f714, %r157; mov.u32 %r651, %r157; $L__BB1_197: setp.eq.f32 %p263, %f119, %f713; @%p263 bra $L__BB1_201; bra.uni $L__BB1_198; $L__BB1_201: setp.eq.f32 %p273, %f714, %f120; mov.pred %p346, %p272; @%p273 bra $L__BB1_205; and.b32 %r584, %r147, 2147483647; mov.b32 %f694, %r584; setp.eq.f32 %p275, %f694, 0f7F800000; and.b32 %r474, %r651, 2147483647; mov.b32 %f538, %r474; setp.eq.f32 %p276, %f538, 0f7F800000; or.pred %p277, %p275, %p276; mov.pred %p346, %p287; @%p277 bra $L__BB1_205; sub.f32 %f539, %f714, %f120; abs.f32 
%f152, %f539; setp.le.f32 %p279, %f152, 0f34000000; mov.pred %p346, %p272; @%p279 bra $L__BB1_205; abs.f32 %f540, %f714; abs.f32 %f541, %f120; setp.gt.f32 %p280, %f541, %f540; selp.f32 %f542, %f541, %f540, %p280; mul.f32 %f543, %f542, 0f34000000; setp.le.f32 %p346, %f152, %f543; bra.uni $L__BB1_205; $L__BB1_198: and.b32 %r583, %r146, 2147483647; mov.b32 %f691, %r583; setp.eq.f32 %p265, %f691, 0f7F800000; and.b32 %r473, %r650, 2147483647; mov.b32 %f532, %r473; setp.eq.f32 %p266, %f532, 0f7F800000; or.pred %p267, %p265, %p266; mov.pred %p346, %p287; @%p267 bra $L__BB1_205; bra.uni $L__BB1_199; $L__BB1_205: cvt.u64.u32 %rd1183, %r651; cvt.u64.u32 %rd1184, %r650; bfi.b64 %rd558, %rd1183, %rd1184, 32, 32; mov.b64 {%r475, %r476}, %rd558; selp.u64 %rd559, 1, 0, %p346; mov.b32 %f544, %r475; sub.f32 %f545, %f544, %f119; mov.b32 %f546, %r476; sub.f32 %f547, %f546, %f120; mul.f32 %f548, %f547, %f547; fma.rn.f32 %f549, %f545, %f545, %f548; add.f32 %f153, %f549, 0f00000000; setp.geu.f32 %p281, %f153, %f715; @%p281 bra $L__BB1_209; sqrt.rn.f32 %f550, %f153; setp.gtu.f32 %p282, %f550, %f6; mov.f32 %f715, %f153; @%p282 bra $L__BB1_209; mov.b64 {%r654, %r477}, %rd559; mov.u32 %r652, %r475; mov.u32 %r653, %r476; mov.f32 %f715, %f153; $L__BB1_209: add.s64 %rd556, %rd556, 1; setp.lt.u64 %p283, %rd556, %rd552; @%p283 bra $L__BB1_182; st.local.u32 [%rd16+8], %r654; mov.b64 %rd1185, {%r652, %r653}; st.local.u64 [%rd16], %rd1185; $L__BB1_211: cvt.u64.u32 %rd1186, %r146; cvt.u64.u32 %rd1187, %r147; bfi.b64 %rd561, %rd1187, %rd1186, 32, 32; ld.local.v4.u32 {%r481, %r482, %r483, %r484}, [%rd16]; mov.b64 %rd563, {%r483, %r484}; mov.b64 %rd562, {%r481, %r482}; mov.b32 {%rs95, %rs96}, %r483; and.b16 %rs97, %rs95, 255; setp.eq.s16 %p284, %rs97, 2; cvt.u64.u16 %rd1188, %rs95; and.b64 %rd1189, %rd1188, 255; selp.b64 %rd1190, 2, %rd1189, %p284; and.b64 %rd1191, %rd563, 4294967040; or.b64 %rd1192, %rd1191, %rd1190; mov.b64 {%r489, %r490}, %rd1192; mov.b32 {%rs119, %rs98}, %r489; and.b16 %rs99, 
%rs119, 255; setp.eq.s16 %p285, %rs99, 2; mov.u32 %r657, 2; mov.u32 %r655, 0; mov.u32 %r656, %r655; @%p285 bra $L__BB1_235; ld.global.u8 %rs100, [%rd54+64]; setp.eq.s16 %p286, %rs100, 0; shr.u64 %rd1193, %rd562, 32; cvt.u32.u64 %r491, %rd1193; mov.b32 %f155, %r491; @%p286 bra $L__BB1_216; mov.b64 {%r492, %r493}, %rd561; mov.b32 %f157, %r493; mov.b32 %f156, %r492; mov.b64 {%r494, %r495}, %rd547; mov.b64 {%r496, %r497}, %rd548; ld.global.u8 %rs20, [%rd54+65]; mov.b32 %f551, %r496; setp.gt.f32 %p288, %f156, %f551; mov.b32 %f552, %r494; setp.lt.f32 %p289, %f156, %f552; or.pred %p290, %p289, %p288; mov.pred %p347, %p287; @%p290 bra $L__BB1_215; setp.geu.f32 %p291, %f157, 0fFF7FFFFF; setp.leu.f32 %p292, %f157, 0f7F7FFFFF; and.pred %p347, %p292, %p291; $L__BB1_215: setp.ge.f32 %p293, %f120, %f155; setp.le.f32 %p294, %f120, %f155; setp.eq.s16 %p295, %rs20, 0; selp.u32 %r498, -1, 0, %p293; selp.u32 %r499, -1, 0, %p294; selp.b32 %r500, %r499, %r498, %p295; and.b32 %r501, %r500, 1; setp.eq.b32 %p296, %r501, 1; and.pred %p297, %p296, %p347; selp.u16 %rs119, 1, 0, %p297; $L__BB1_216: cvt.u32.u64 %r502, %rd562; mov.b32 %f553, %r502; mul.f32 %f554, %f118, %f553; ld.global.f32 %f555, [%rd54+252]; mul.f32 %f556, %f555, %f155; sub.f32 %f557, %f554, %f556; mul.f32 %f558, %f555, %f553; fma.rn.f32 %f559, %f118, %f155, %f558; add.f32 %f560, %f116, %f557; mov.b32 %r503, %f560; add.f32 %f561, %f117, %f559; mov.b32 %r504, %f561; cvt.u64.u32 %rd1194, %r504; cvt.u64.u32 %rd1195, %r503; cvt.u64.u16 %rd1196, %rs119; bfi.b64 %rd1197, %rd1194, %rd1195, 32, 32; and.b64 %rd1198, %rd1196, 255; mov.b64 {%r655, %r656}, %rd1197; mov.b64 {%r657, %r505}, %rd1198; bra.uni $L__BB1_235; $L__BB1_34: ld.local.u32 %r245, [%rd4+24]; setp.eq.s32 %p53, %r245, 0; @%p53 bra $L__BB1_47; setp.ne.s32 %p54, %r245, 1; @%p54 bra $L__BB1_60; add.s64 %rd83, %rd1375, 1; or.b64 %rd695, %rd83, %rd64; and.b64 %rd696, %rd695, -4294967296; setp.eq.s64 %p55, %rd696, 0; @%p55 bra $L__BB1_38; rem.u64 %rd1379, %rd83, %rd64; bra.uni 
$L__BB1_39; $L__BB1_47: setp.eq.s64 %p62, %rd1375, 0; selp.b64 %rd130, %rd64, %rd1375, %p62; add.s64 %rd735, %rd130, -1; setp.gt.u64 %p63, %rd64, %rd735; @%p63 bra $L__BB1_49; bra.uni $L__BB1_48; $L__BB1_49: shl.b64 %rd736, %rd130, 3; add.s64 %rd737, %rd65, %rd736; ld.u32 %rd738, [%rd737+-8]; ld.u32 %rd739, [%rd737+-4]; bfi.b64 %rd131, %rd739, %rd738, 32, 32; or.b64 %rd740, %rd130, %rd64; and.b64 %rd741, %rd740, -4294967296; setp.eq.s64 %p64, %rd741, 0; @%p64 bra $L__BB1_51; rem.u64 %rd1396, %rd130, %rd64; bra.uni $L__BB1_52; $L__BB1_165: ld.u32 %r431, [%rd196+76]; cvt.u64.u32 %rd1105, %r431; setp.le.u64 %p230, %rd187, %rd1105; mul.wide.u32 %rd1106, %r431, 12; add.s64 %rd1107, %rd188, %rd1106; setp.eq.s64 %p231, %rd1107, 0; or.pred %p232, %p230, %p231; selp.b32 %r40, %r40, %r611, %p232; selp.b32 %r39, %r39, %r610, %p232; selp.b32 %r38, %r38, %r609, %p232; selp.b32 %r42, %r42, %r624, %p232; selp.b32 %r43, %r43, %r92, %p232; $L__BB1_71: mov.u32 %r44, %r45; setp.eq.s32 %p74, %r44, 0; @%p74 bra $L__BB1_172; mov.b32 %f640, %r43; cvt.u64.u32 %rd797, %r44; add.s64 %rd798, %rd797, -1; cvt.u32.u64 %r45, %rd798; st.local.u32 [%rd16+512], %r45; mul.wide.u32 %rd799, %r44, 8; add.s64 %rd800, %rd16, %rd799; ld.local.u32 %rd194, [%rd800+-4]; ld.local.u32 %rd801, [%rd800+-8]; shl.b64 %rd802, %rd801, 32; or.b64 %rd193, %rd802, 1; mov.b64 {%r287, %r288}, %rd194; mov.b32 %f298, %r287; neg.f32 %f299, %f298; setp.le.f32 %p75, %f640, %f299; @%p75 bra $L__BB1_71; mov.b64 {%r289, %r290}, %rd193; cvt.u64.u32 %rd195, %r290; setp.gt.u64 %p76, %rd184, %rd195; @%p76 bra $L__BB1_75; bra.uni $L__BB1_74; $L__BB1_75: mul.lo.s64 %rd803, %rd195, 96; add.s64 %rd196, %rd186, %rd803; ld.u8 %rs35, [%rd196+88]; and.b16 %rs36, %rs35, 1; setp.eq.b16 %p78, %rs36, 1; xor.pred %p79, %p78, %p287; not.pred %p80, %p79; mov.pred %p345, %p287; @%p80 bra $L__BB1_77; ld.v4.u32 {%r291, %r292, %r293, %r294}, [%rd196+64]; cvt.u64.u32 %rd804, %r291; setp.gt.u64 %p82, %rd187, %rd804; mul.wide.u32 %rd805, %r291, 12; 
add.s64 %rd806, %rd188, %rd805; selp.b64 %rd807, %rd806, 0, %p82; setp.eq.s64 %p83, %rd807, 0; add.s64 %rd808, %rd807, 8; selp.b64 %rd1417, 0, %rd808, %p83; cvt.u64.u32 %rd809, %r292; setp.gt.u64 %p84, %rd187, %rd809; mul.wide.u32 %rd810, %r292, 12; add.s64 %rd811, %rd188, %rd810; selp.b64 %rd812, %rd811, 0, %p84; setp.eq.s64 %p85, %rd812, 0; add.s64 %rd813, %rd812, 8; selp.b64 %rd1416, 0, %rd813, %p85; ld.u32 %r298, [%rd196+72]; cvt.u64.u32 %rd814, %r298; setp.gt.u64 %p86, %rd187, %rd814; mul.wide.u32 %rd815, %r298, 12; add.s64 %rd816, %rd188, %rd815; selp.b64 %rd817, %rd816, 0, %p86; setp.eq.s64 %p87, %rd817, 0; add.s64 %rd818, %rd817, 8; selp.b64 %rd1415, 0, %rd818, %p87; cvt.u64.u32 %rd819, %r294; setp.gt.u64 %p88, %rd187, %rd819; mul.wide.u32 %rd820, %r294, 12; add.s64 %rd821, %rd188, %rd820; selp.b64 %rd822, %rd821, 0, %p88; setp.eq.s64 %p89, %rd822, 0; add.s64 %rd823, %rd822, 8; selp.b64 %rd1414, 0, %rd823, %p89; mov.pred %p345, %p272; $L__BB1_77: mov.b32 %f641, %r43; ld.v4.f32 {%f300, %f301, %f302, %f303}, [%rd196]; sub.f32 %f308, %f300, %f49; sub.f32 %f309, %f301, %f49; sub.f32 %f310, %f302, %f49; sub.f32 %f311, %f303, %f49; ld.v4.f32 {%f312, %f313, %f314, %f315}, [%rd196+16]; sub.f32 %f320, %f312, %f50; sub.f32 %f321, %f313, %f50; sub.f32 %f322, %f314, %f50; sub.f32 %f323, %f315, %f50; ld.v4.f32 {%f324, %f325, %f326, %f327}, [%rd196+32]; sub.f32 %f332, %f49, %f324; sub.f32 %f333, %f49, %f325; sub.f32 %f334, %f49, %f326; sub.f32 %f335, %f49, %f327; ld.v4.f32 {%f336, %f337, %f338, %f339}, [%rd196+48]; sub.f32 %f344, %f50, %f336; sub.f32 %f345, %f50, %f337; sub.f32 %f346, %f50, %f338; sub.f32 %f347, %f50, %f339; setp.ge.f32 %p90, %f308, %f332; selp.f32 %f348, %f308, %f332, %p90; setp.ge.f32 %p91, %f309, %f333; selp.f32 %f349, %f309, %f333, %p91; setp.ge.f32 %p92, %f310, %f334; selp.f32 %f350, %f310, %f334, %p92; setp.ge.f32 %p93, %f311, %f335; selp.f32 %f351, %f311, %f335, %p93; setp.ge.f32 %p94, %f320, %f344; selp.f32 %f352, %f320, %f344, %p94; setp.ge.f32 
%p95, %f321, %f345; selp.f32 %f353, %f321, %f345, %p95; setp.ge.f32 %p96, %f322, %f346; selp.f32 %f354, %f322, %f346, %p96; setp.ge.f32 %p97, %f323, %f347; selp.f32 %f355, %f323, %f347, %p97; setp.ge.f32 %p98, %f348, 0f00000000; selp.f32 %f356, %f348, 0f00000000, %p98; setp.ge.f32 %p99, %f349, 0f00000000; selp.f32 %f357, %f349, 0f00000000, %p99; setp.ge.f32 %p100, %f350, 0f00000000; selp.f32 %f358, %f350, 0f00000000, %p100; setp.ge.f32 %p101, %f351, 0f00000000; selp.f32 %f359, %f351, 0f00000000, %p101; mov.b32 %r299, %f356; mov.b32 %r300, %f357; mov.b32 %r301, %f358; mov.b32 %r302, %f359; cvt.u64.u32 %rd824, %r302; cvt.u64.u32 %rd825, %r300; cvt.u64.u32 %rd826, %r299; cvt.u64.u32 %rd827, %r301; bfi.b64 %rd828, %rd824, %rd827, 32, 32; bfi.b64 %rd829, %rd825, %rd826, 32, 32; setp.ge.f32 %p102, %f352, 0f00000000; selp.f32 %f360, %f352, 0f00000000, %p102; setp.ge.f32 %p103, %f353, 0f00000000; selp.f32 %f361, %f353, 0f00000000, %p103; setp.ge.f32 %p104, %f354, 0f00000000; selp.f32 %f362, %f354, 0f00000000, %p104; setp.ge.f32 %p105, %f355, 0f00000000; selp.f32 %f363, %f355, 0f00000000, %p105; mov.b32 %r303, %f360; mov.b32 %r304, %f361; mov.b32 %r305, %f362; mov.b32 %r306, %f363; cvt.u64.u32 %rd830, %r306; cvt.u64.u32 %rd831, %r304; cvt.u64.u32 %rd832, %r303; cvt.u64.u32 %rd833, %r305; bfi.b64 %rd834, %rd830, %rd833, 32, 32; bfi.b64 %rd835, %rd831, %rd832, 32, 32; mov.b64 {%r307, %r308}, %rd829; mov.b64 {%r309, %r310}, %rd828; cvt.u64.u32 %rd836, %r310; cvt.u64.u32 %rd837, %r308; cvt.u64.u32 %rd838, %r309; bfi.b64 %rd839, %rd836, %rd838, 32, 32; mov.b64 {%r311, %r312}, %rd839; bfi.b64 %rd840, %rd837, %rd826, 32, 32; mov.b64 {%r313, %r314}, %rd840; mov.b32 %f364, %r313; mov.b32 %f365, %r314; mov.b32 %f366, %r311; mov.b32 %f367, %r312; mov.b32 %f368, %r307; mov.b32 %f369, %r308; mov.b32 %f370, %r309; mov.b32 %f371, %r310; mov.b64 {%r315, %r316}, %rd835; mov.b64 {%r317, %r318}, %rd834; cvt.u64.u32 %rd841, %r318; cvt.u64.u32 %rd842, %r316; cvt.u64.u32 %rd843, %r317; bfi.b64 
%rd844, %rd841, %rd843, 32, 32; mov.b64 {%r319, %r320}, %rd844; bfi.b64 %rd845, %rd842, %rd832, 32, 32; mov.b64 {%r321, %r322}, %rd845; mov.b32 %f372, %r321; mov.b32 %f373, %r322; mov.b32 %f374, %r319; mov.b32 %f375, %r320; mov.b32 %f376, %r315; mov.b32 %f377, %r316; mov.b32 %f378, %r317; mov.b32 %f379, %r318; mul.f32 %f380, %f376, %f372; mul.f32 %f381, %f377, %f373; mul.f32 %f382, %f378, %f374; mul.f32 %f383, %f379, %f375; fma.rn.f32 %f384, %f368, %f364, %f380; fma.rn.f32 %f385, %f369, %f365, %f381; fma.rn.f32 %f386, %f370, %f366, %f382; fma.rn.f32 %f387, %f371, %f367, %f383; add.f32 %f388, %f384, 0f00000000; add.f32 %f389, %f385, 0f00000000; add.f32 %f390, %f386, 0f00000000; add.f32 %f391, %f387, 0f00000000; sqrt.rn.f32 %f392, %f388; sqrt.rn.f32 %f393, %f389; sqrt.rn.f32 %f394, %f390; sqrt.rn.f32 %f395, %f391; mov.b32 %r323, %f392; mov.b32 %r324, %f393; mov.b32 %r325, %f394; mov.b32 %r326, %f395; cvt.u64.u32 %rd846, %r326; cvt.u64.u32 %rd847, %r324; cvt.u64.u32 %rd848, %r323; cvt.u64.u32 %rd849, %r325; bfi.b64 %rd1523, %rd846, %rd849, 32, 32; mov.b64 {%r327, %r328}, %rd1523; bfi.b64 %rd1522, %rd847, %rd848, 32, 32; mov.b64 {%r329, %r330}, %rd1522; mov.b32 %f396, %r329; mov.b32 %f397, %r330; mov.b32 %f398, %r327; mov.b32 %f399, %r328; setp.lt.f32 %p106, %f396, %f641; setp.lt.f32 %p107, %f397, %f641; setp.lt.f32 %p108, %f398, %f641; setp.lt.f32 %p109, %f399, %f641; selp.u32 %r331, 1, 0, %p106; selp.u32 %r332, -1, 0, %p107; bfi.b32 %r333, %r332, %r331, 8, 1; selp.u32 %r334, -1, 0, %p108; bfi.b32 %r335, %r334, %r333, 16, 1; selp.u32 %r336, -1, 0, %p109; bfi.b32 %r337, %r336, %r335, 24, 1; cvt.u64.u32 %rd850, %r337; mov.b64 {%r338, %r339}, %rd850; mov.b32 {%rs37, %rs38}, %r338; and.b16 %rs39, %rs37, 1; shr.u16 %rs40, %rs37, 7; and.b16 %rs41, %rs40, 2; or.b16 %rs42, %rs41, %rs39; shl.b16 %rs43, %rs38, 2; and.b16 %rs44, %rs43, 4; or.b16 %rs45, %rs42, %rs44; shr.u16 %rs46, %rs38, 5; and.b16 %rs47, %rs46, 8; or.b16 %rs48, %rs45, %rs47; cvt.u64.u16 %rd207, %rs48; @%p345 
bra $L__BB1_79; bra.uni $L__BB1_78; $L__BB1_79: mov.u64 %rd212, 1; st.local.v2.u64 [%rd15], {%rd1417, %rd1416}; st.local.v2.u64 [%rd15+16], {%rd1415, %rd1414}; mov.f32 %f400, 0f00000000; st.local.v4.f32 [%rd14], {%f400, %f400, %f400, %f400}; mov.u32 %r350, 4; st.local.u32 [%rd13+16], %r350; st.local.u32 [%rd13+52], %r350; st.local.u32 [%rd13+88], %r350; st.local.u32 [%rd13+124], %r350; $L__BB1_80: mov.u64 %rd1329, 1; add.s64 %rd852, %rd212, -1; cvt.u32.u64 %r351, %rd852; shl.b64 %rd854, %rd1329, %r351; and.b64 %rd855, %rd854, %rd207; setp.eq.s64 %p110, %rd855, 0; @%p110 bra $L__BB1_133; shl.b64 %rd856, %rd212, 3; add.s64 %rd857, %rd15, %rd856; ld.local.u64 %rd213, [%rd857+-8]; setp.eq.s64 %p111, %rd213, 0; @%p111 bra $L__BB1_133; ld.u32 %r46, [%rd213]; cvt.u64.u32 %rd214, %r46; ld.global.u64 %rd858, [%rd54+112]; setp.gt.u64 %p112, %rd858, %rd214; @%p112 bra $L__BB1_84; bra.uni $L__BB1_83; $L__BB1_84: ld.global.u64 %rd859, [%rd54+104]; mul.lo.s64 %rd860, %rd214, 12; add.s64 %rd215, %rd859, %rd860; ld.u32 %rd216, [%rd215+8]; ld.u32 %rd217, [%rd215]; ld.global.u64 %rd218, [%rd54+96]; setp.gt.u64 %p113, %rd218, %rd217; @%p113 bra $L__BB1_86; bra.uni $L__BB1_85; $L__BB1_86: ld.global.u64 %rd219, [%rd54+88]; shl.b64 %rd861, %rd217, 3; add.s64 %rd862, %rd219, %rd861; ld.u32 %rd863, [%rd862]; ld.u32 %rd864, [%rd862+4]; bfi.b64 %rd220, %rd864, %rd863, 32, 32; ld.u32 %rd221, [%rd215+4]; setp.gt.u64 %p114, %rd218, %rd221; @%p114 bra $L__BB1_88; bra.uni $L__BB1_87; $L__BB1_88: setp.gt.u64 %p115, %rd218, %rd216; @%p115 bra $L__BB1_90; bra.uni $L__BB1_89; $L__BB1_90: shl.b64 %rd865, %rd221, 3; add.s64 %rd866, %rd219, %rd865; shl.b64 %rd867, %rd216, 3; add.s64 %rd868, %rd219, %rd867; cvt.u32.u64 %r352, %rd220; mov.b32 %f52, %r352; shr.u64 %rd869, %rd220, 32; cvt.u32.u64 %r353, %rd869; mov.b32 %f53, %r353; ld.u32 %rd870, [%rd866]; ld.u32 %rd871, [%rd866+4]; bfi.b64 %rd222, %rd871, %rd870, 32, 32; cvt.u32.u64 %r354, %rd222; shr.u64 %rd872, %rd222, 32; cvt.u32.u64 %r355, %rd872; 
mov.b32 %f54, %r354; sub.f32 %f55, %f54, %f52; mov.b32 %f707, %r355; sub.f32 %f57, %f707, %f53; ld.u32 %rd873, [%rd868]; ld.u32 %rd874, [%rd868+4]; bfi.b64 %rd223, %rd874, %rd873, 32, 32; cvt.u32.u64 %r356, %rd223; shr.u64 %rd875, %rd223, 32; cvt.u32.u64 %r357, %rd875; mov.b32 %f58, %r356; sub.f32 %f59, %f58, %f52; mov.b32 %f60, %r357; sub.f32 %f61, %f60, %f53; sub.f32 %f62, %f49, %f52; sub.f32 %f63, %f50, %f53; mul.f32 %f401, %f57, %f63; fma.rn.f32 %f64, %f55, %f62, %f401; mul.f32 %f402, %f61, %f63; fma.rn.f32 %f65, %f59, %f62, %f402; setp.le.f32 %p116, %f64, 0f00000000; setp.le.f32 %p117, %f65, 0f00000000; and.pred %p118, %p116, %p117; @%p118 bra $L__BB1_128; bra.uni $L__BB1_91; $L__BB1_128: add.u64 %rd1512, %SPL, 0; add.u64 %rd1508, %SP, 728; add.u64 %rd1514, %SP, 0; st.local.u64 [%rd1512], %rd220; mov.u64 %rd1519, 2; mov.u64 %rd1505, %rd32; mov.u64 %rd1506, %rd17; mov.u64 %rd1507, %rd17; mov.u64 %rd1509, %rd17; mov.u64 %rd1510, %rd17; mov.u64 %rd1511, %rd1508; mov.u64 %rd1513, %rd1512; mov.u64 %rd1515, %rd1512; mov.u64 %rd1516, %rd1512; mov.u64 %rd1517, %rd1514; mov.u64 %rd1518, %rd26; $L__BB1_129: setp.eq.s64 %p171, %rd1519, 0; mov.u64 %rd1520, 1; @%p171 bra $L__BB1_131; add.s64 %rd1519, %rd1519, -1; add.s64 %rd1020, %rd1506, 8; setp.eq.s64 %p172, %rd1509, %rd1505; selp.b64 %rd1021, %rd1020, %rd1509, %p172; add.s64 %rd1022, %rd1507, 8; selp.b64 %rd1023, %rd1022, %rd1510, %p172; add.s64 %rd1024, %rd1508, 8; selp.b64 %rd1025, %rd1024, %rd1511, %p172; mov.u64 %rd1520, 0; setp.eq.s64 %p173, %rd1519, 0; add.s64 %rd1026, %rd1021, 4; add.s64 %rd1027, %rd1023, 4; add.s64 %rd1028, %rd1025, 4; selp.b64 %rd449, %rd1021, %rd1026, %p173; selp.b64 %rd1510, %rd1023, %rd1027, %p173; selp.b64 %rd1511, %rd1025, %rd1028, %p173; selp.b64 %rd1506, %rd1020, %rd1506, %p172; selp.b64 %rd1507, %rd1022, %rd1507, %p172; selp.b64 %rd1508, %rd1024, %rd1508, %p172; add.s64 %rd1029, %rd1509, 8; selp.b64 %rd1505, %rd1029, %rd1505, %p172; add.s64 %rd1030, %rd1515, 8; setp.eq.s64 %p174, 
%rd1512, %rd1518; selp.b64 %rd1031, %rd1030, %rd1512, %p174; add.s64 %rd1032, %rd1516, 8; selp.b64 %rd1033, %rd1032, %rd1513, %p174; add.s64 %rd1034, %rd1517, 8; selp.b64 %rd1035, %rd1034, %rd1514, %p174; selp.b64 %rd1515, %rd1030, %rd1515, %p174; selp.b64 %rd1516, %rd1032, %rd1516, %p174; selp.b64 %rd1517, %rd1034, %rd1517, %p174; add.s64 %rd1036, %rd1512, 8; selp.b64 %rd1518, %rd1036, %rd1518, %p174; add.s64 %rd1037, %rd1031, 4; add.s64 %rd1038, %rd1033, 4; add.s64 %rd1039, %rd1035, 4; selp.b64 %rd1512, %rd1031, %rd1037, %p173; selp.b64 %rd1513, %rd1033, %rd1038, %p173; selp.b64 %rd1514, %rd1035, %rd1039, %p173; ld.local.f32 %f468, [%rd1033]; ld.local.f32 %f469, [%rd1023]; setp.eq.f32 %p175, %f469, %f468; mov.u64 %rd1509, %rd449; @%p175 bra $L__BB1_129; $L__BB1_131: cvt.u32.u64 %r537, %rd220; mov.u64 %rd1274, 0; or.b64 %rd1041, %rd1274, %rd220; mov.b64 {%r399, %r400}, %rd1041; mov.b64 {%r401, %r402}, %rd1520; cvt.u32.u64 %r404, %rd1274; or.b32 %r606, %r404, %r537; mov.u32 %r607, 0; mov.b32 %f711, %r400; mov.b32 {%rs118, %rs67}, %r401; mov.u32 %r608, %r607; bra.uni $L__BB1_132; $L__BB1_91: cvt.u32.u64 %r559, %rd222; mov.b32 %f653, %r559; sub.f32 %f66, %f49, %f653; sub.f32 %f67, %f50, %f707; mul.f32 %f403, %f57, %f67; fma.rn.f32 %f68, %f55, %f66, %f403; mul.f32 %f404, %f61, %f67; fma.rn.f32 %f69, %f59, %f66, %f404; setp.ge.f32 %p119, %f68, 0f00000000; setp.le.f32 %p120, %f69, %f68; and.pred %p121, %p120, %p119; @%p121 bra $L__BB1_124; bra.uni $L__BB1_92; $L__BB1_124: add.u64 %rd1496, %SPL, 0; add.u64 %rd1492, %SP, 728; add.u64 %rd1498, %SP, 0; st.local.u64 [%rd1496], %rd222; mov.u64 %rd1503, 2; mov.u64 %rd1489, %rd32; mov.u64 %rd1490, %rd17; mov.u64 %rd1491, %rd17; mov.u64 %rd1493, %rd17; mov.u64 %rd1494, %rd17; mov.u64 %rd1495, %rd1492; mov.u64 %rd1497, %rd1496; mov.u64 %rd1499, %rd1496; mov.u64 %rd1500, %rd1496; mov.u64 %rd1501, %rd1498; mov.u64 %rd1502, %rd27; $L__BB1_125: setp.eq.s64 %p166, %rd1503, 0; mov.u64 %rd1504, 1; @%p166 bra $L__BB1_127; add.s64 
%rd1503, %rd1503, -1; add.s64 %rd993, %rd1490, 8; setp.eq.s64 %p167, %rd1493, %rd1489; selp.b64 %rd994, %rd993, %rd1493, %p167; add.s64 %rd995, %rd1491, 8; selp.b64 %rd996, %rd995, %rd1494, %p167; add.s64 %rd997, %rd1492, 8; selp.b64 %rd998, %rd997, %rd1495, %p167; mov.u64 %rd1504, 0; setp.eq.s64 %p168, %rd1503, 0; add.s64 %rd999, %rd994, 4; add.s64 %rd1000, %rd996, 4; add.s64 %rd1001, %rd998, 4; selp.b64 %rd411, %rd994, %rd999, %p168; selp.b64 %rd1494, %rd996, %rd1000, %p168; selp.b64 %rd1495, %rd998, %rd1001, %p168; selp.b64 %rd1490, %rd993, %rd1490, %p167; selp.b64 %rd1491, %rd995, %rd1491, %p167; selp.b64 %rd1492, %rd997, %rd1492, %p167; add.s64 %rd1002, %rd1493, 8; selp.b64 %rd1489, %rd1002, %rd1489, %p167; add.s64 %rd1003, %rd1499, 8; setp.eq.s64 %p169, %rd1496, %rd1502; selp.b64 %rd1004, %rd1003, %rd1496, %p169; add.s64 %rd1005, %rd1500, 8; selp.b64 %rd1006, %rd1005, %rd1497, %p169; add.s64 %rd1007, %rd1501, 8; selp.b64 %rd1008, %rd1007, %rd1498, %p169; selp.b64 %rd1499, %rd1003, %rd1499, %p169; selp.b64 %rd1500, %rd1005, %rd1500, %p169; selp.b64 %rd1501, %rd1007, %rd1501, %p169; add.s64 %rd1009, %rd1496, 8; selp.b64 %rd1502, %rd1009, %rd1502, %p169; add.s64 %rd1010, %rd1004, 4; add.s64 %rd1011, %rd1006, 4; add.s64 %rd1012, %rd1008, 4; selp.b64 %rd1496, %rd1004, %rd1010, %p168; selp.b64 %rd1497, %rd1006, %rd1011, %p168; selp.b64 %rd1498, %rd1008, %rd1012, %p168; ld.local.f32 %f466, [%rd1006]; ld.local.f32 %f467, [%rd996]; setp.eq.f32 %p170, %f467, %f466; mov.u64 %rd1493, %rd411; @%p170 bra $L__BB1_125; $L__BB1_127: cvt.u32.u64 %r536, %rd222; mov.u64 %rd1273, 0; or.b64 %rd1014, %rd1273, %rd222; mov.b64 {%r391, %r392}, %rd1014; mov.b64 {%r393, %r394}, %rd1504; cvt.u32.u64 %r396, %rd1273; or.b32 %r606, %r396, %r536; mov.u32 %r607, 0; mov.b32 %f711, %r392; mov.u32 %r608, 1; mov.b32 {%rs118, %rs63}, %r393; bra.uni $L__BB1_132; $L__BB1_92: shr.u64 %rd1340, %rd223, 32; cvt.u32.u64 %r539, %rd1340; mov.b32 %f646, %r539; cvt.u32.u64 %r538, %rd223; mov.b32 %f645, 
%r538; sub.f32 %f70, %f49, %f645; sub.f32 %f71, %f50, %f646; mul.f32 %f405, %f57, %f71; fma.rn.f32 %f72, %f55, %f70, %f405; mul.f32 %f406, %f61, %f71; fma.rn.f32 %f73, %f59, %f70, %f406; setp.ge.f32 %p122, %f73, 0f00000000; setp.le.f32 %p123, %f72, %f73; and.pred %p124, %p123, %p122; @%p124 bra $L__BB1_120; bra.uni $L__BB1_93; $L__BB1_120: add.u64 %rd1480, %SPL, 0; add.u64 %rd1476, %SP, 728; add.u64 %rd1482, %SP, 0; st.local.u64 [%rd1480], %rd223; mov.u64 %rd1487, 2; mov.u64 %rd1473, %rd32; mov.u64 %rd1474, %rd17; mov.u64 %rd1475, %rd17; mov.u64 %rd1477, %rd17; mov.u64 %rd1478, %rd17; mov.u64 %rd1479, %rd1476; mov.u64 %rd1481, %rd1480; mov.u64 %rd1483, %rd1480; mov.u64 %rd1484, %rd1480; mov.u64 %rd1485, %rd1482; mov.u64 %rd1486, %rd28; $L__BB1_121: setp.eq.s64 %p161, %rd1487, 0; mov.u64 %rd1488, 1; @%p161 bra $L__BB1_123; add.s64 %rd1487, %rd1487, -1; add.s64 %rd966, %rd1474, 8; setp.eq.s64 %p162, %rd1477, %rd1473; selp.b64 %rd967, %rd966, %rd1477, %p162; add.s64 %rd968, %rd1475, 8; selp.b64 %rd969, %rd968, %rd1478, %p162; add.s64 %rd970, %rd1476, 8; selp.b64 %rd971, %rd970, %rd1479, %p162; mov.u64 %rd1488, 0; setp.eq.s64 %p163, %rd1487, 0; add.s64 %rd972, %rd967, 4; add.s64 %rd973, %rd969, 4; add.s64 %rd974, %rd971, 4; selp.b64 %rd373, %rd967, %rd972, %p163; selp.b64 %rd1478, %rd969, %rd973, %p163; selp.b64 %rd1479, %rd971, %rd974, %p163; selp.b64 %rd1474, %rd966, %rd1474, %p162; selp.b64 %rd1475, %rd968, %rd1475, %p162; selp.b64 %rd1476, %rd970, %rd1476, %p162; add.s64 %rd975, %rd1477, 8; selp.b64 %rd1473, %rd975, %rd1473, %p162; add.s64 %rd976, %rd1483, 8; setp.eq.s64 %p164, %rd1480, %rd1486; selp.b64 %rd977, %rd976, %rd1480, %p164; add.s64 %rd978, %rd1484, 8; selp.b64 %rd979, %rd978, %rd1481, %p164; add.s64 %rd980, %rd1485, 8; selp.b64 %rd981, %rd980, %rd1482, %p164; selp.b64 %rd1483, %rd976, %rd1483, %p164; selp.b64 %rd1484, %rd978, %rd1484, %p164; selp.b64 %rd1485, %rd980, %rd1485, %p164; add.s64 %rd982, %rd1480, 8; selp.b64 %rd1486, %rd982, %rd1486, %p164; 
add.s64 %rd983, %rd977, 4; add.s64 %rd984, %rd979, 4; add.s64 %rd985, %rd981, 4; selp.b64 %rd1480, %rd977, %rd983, %p163; selp.b64 %rd1481, %rd979, %rd984, %p163; selp.b64 %rd1482, %rd981, %rd985, %p163; ld.local.f32 %f464, [%rd979]; ld.local.f32 %f465, [%rd969]; setp.eq.f32 %p165, %f465, %f464; mov.u64 %rd1477, %rd373; @%p165 bra $L__BB1_121; $L__BB1_123: cvt.u32.u64 %r535, %rd223; mov.u64 %rd1272, 0; or.b64 %rd987, %rd1272, %rd223; mov.b64 {%r383, %r384}, %rd987; mov.b64 {%r385, %r386}, %rd1488; cvt.u32.u64 %r388, %rd1272; or.b32 %r606, %r388, %r535; mov.u32 %r607, 0; mov.b32 %f711, %r384; mov.b32 {%rs118, %rs59}, %r385; mov.u32 %r608, 2; bra.uni $L__BB1_132; $L__BB1_93: cvt.u32.u64 %r580, %rd220; mov.b32 %f679, %r580; sub.f32 %f678, %f49, %f679; shr.u64 %rd1342, %rd220, 32; cvt.u32.u64 %r543, %rd1342; mov.b32 %f651, %r543; sub.f32 %f650, %f50, %f651; shr.u64 %rd1341, %rd223, 32; cvt.u32.u64 %r542, %rd1341; mov.b32 %f649, %r542; cvt.u32.u64 %r541, %rd223; mov.b32 %f648, %r541; cvt.u32.u64 %r540, %rd222; mov.b32 %f647, %r540; sub.f32 %f74, %f648, %f647; sub.f32 %f75, %f649, %f707; mul.f32 %f407, %f57, %f59; mul.f32 %f408, %f55, %f61; sub.f32 %f76, %f408, %f407; mul.f32 %f409, %f57, %f678; mul.f32 %f410, %f55, %f650; sub.f32 %f411, %f410, %f409; mul.f32 %f412, %f76, %f411; setp.lt.f32 %p125, %f412, 0f00000000; setp.ge.f32 %p126, %f64, 0f00000000; and.pred %p127, %p126, %p125; setp.le.f32 %p128, %f68, 0f00000000; and.pred %p129, %p128, %p127; mov.u16 %rs117, 0; @%p129 bra $L__BB1_96; cvt.u32.u64 %r568, %rd223; mov.b32 %f665, %r568; sub.f32 %f664, %f49, %f665; shr.u64 %rd1353, %rd223, 32; cvt.u32.u64 %r567, %rd1353; mov.b32 %f663, %r567; sub.f32 %f662, %f50, %f663; mul.f32 %f413, %f59, %f662; mul.f32 %f414, %f664, %f61; sub.f32 %f415, %f413, %f414; mul.f32 %f416, %f76, %f415; setp.gt.f32 %p130, %f416, 0f80000000; setp.ge.f32 %p131, %f65, 0f00000000; and.pred %p132, %p131, %p130; setp.le.f32 %p133, %f73, 0f00000000; and.pred %p134, %p133, %p132; mov.u16 %rs117, 1; 
@%p134 bra $L__BB1_96; mul.f32 %f417, %f74, %f67; mul.f32 %f418, %f66, %f75; sub.f32 %f419, %f417, %f418; mul.f32 %f420, %f76, %f419; setp.lt.f32 %p135, %f420, 0f00000000; sub.f32 %f421, %f69, %f68; setp.ge.f32 %p136, %f421, 0f00000000; and.pred %p137, %p136, %p135; sub.f32 %f422, %f72, %f73; setp.ge.f32 %p138, %f422, 0f00000000; and.pred %p139, %p138, %p137; selp.b16 %rs117, 2, 3, %p139; $L__BB1_96: mul.f32 %f423, %f57, %f57; fma.rn.f32 %f424, %f55, %f55, %f423; add.f32 %f77, %f424, 0f00000000; mul.f32 %f425, %f61, %f61; fma.rn.f32 %f426, %f59, %f59, %f425; add.f32 %f78, %f426, 0f00000000; mul.f32 %f427, %f75, %f75; fma.rn.f32 %f428, %f74, %f74, %f427; add.f32 %f79, %f428, 0f00000000; setp.eq.s16 %p140, %rs117, 1; @%p140 bra $L__BB1_111; setp.eq.s16 %p141, %rs117, 2; @%p141 bra $L__BB1_107; setp.ne.s16 %p142, %rs117, 3; @%p142 bra $L__BB1_115; cvt.u32.u64 %r581, %rd220; mov.b32 %f681, %r581; sub.f32 %f680, %f49, %f681; shr.u64 %rd1352, %rd220, 32; cvt.u32.u64 %r560, %rd1352; mov.b32 %f655, %r560; sub.f32 %f654, %f50, %f655; sub.f32 %f429, %f64, %f68; div.rn.f32 %f80, %f64, %f429; sub.f32 %f430, %f65, %f73; div.rn.f32 %f81, %f65, %f430; sub.f32 %f431, %f69, %f68; add.f32 %f432, %f72, %f431; sub.f32 %f433, %f432, %f73; div.rn.f32 %f709, %f431, %f433; mul.f32 %f434, %f654, %f654; fma.rn.f32 %f435, %f680, %f680, %f434; add.f32 %f436, %f435, 0f00000000; mul.f32 %f437, %f77, %f80; mul.f32 %f438, %f80, %f437; sub.f32 %f83, %f436, %f438; mul.f32 %f439, %f78, %f709; mul.f32 %f440, %f709, %f439; sub.f32 %f84, %f436, %f440; mul.f32 %f441, %f67, %f67; fma.rn.f32 %f442, %f66, %f66, %f441; add.f32 %f443, %f442, 0f00000000; mul.f32 %f444, %f79, %f81; mul.f32 %f445, %f81, %f444; sub.f32 %f85, %f443, %f445; setp.lt.f32 %p143, %f83, %f84; @%p143 bra $L__BB1_103; bra.uni $L__BB1_100; $L__BB1_103: setp.lt.f32 %p145, %f83, %f85; @%p145 bra $L__BB1_105; bra.uni $L__BB1_104; $L__BB1_105: cvt.u32.u64 %r564, %rd220; mov.b32 %f659, %r564; mul.f32 %f708, %f57, %f80; fma.rn.f32 %f706, %f55, 
%f80, %f659; mov.u32 %r608, 0; mov.f32 %f707, %f53; mov.f32 %f709, %f80; bra.uni $L__BB1_106; $L__BB1_111: cvt.u32.u64 %r566, %rd220; mov.b32 %f661, %r566; add.u64 %rd1446, %SPL, 0; add.u64 %rd1442, %SP, 728; add.u64 %rd1448, %SP, 0; div.rn.f32 %f710, %f65, %f78; fma.rn.f32 %f454, %f59, %f710, %f661; mov.b32 %r368, %f454; fma.rn.f32 %f455, %f61, %f710, %f53; mov.b32 %r369, %f455; cvt.u64.u32 %rd906, %r369; cvt.u64.u32 %rd907, %r368; bfi.b64 %rd272, %rd906, %rd907, 32, 32; st.local.u64 [%rd1446], %rd272; mov.u64 %rd1453, 2; mov.u64 %rd1439, %rd32; mov.u64 %rd1440, %rd17; mov.u64 %rd1441, %rd17; mov.u64 %rd1443, %rd17; mov.u64 %rd1444, %rd17; mov.u64 %rd1445, %rd1442; mov.u64 %rd1447, %rd1446; mov.u64 %rd1449, %rd1446; mov.u64 %rd1450, %rd1446; mov.u64 %rd1451, %rd1448; mov.u64 %rd1452, %rd30; $L__BB1_112: setp.eq.s64 %p151, %rd1453, 0; mov.u64 %rd1472, 1; @%p151 bra $L__BB1_114; add.s64 %rd1453, %rd1453, -1; add.s64 %rd912, %rd1440, 8; setp.eq.s64 %p152, %rd1443, %rd1439; selp.b64 %rd913, %rd912, %rd1443, %p152; add.s64 %rd914, %rd1441, 8; selp.b64 %rd915, %rd914, %rd1444, %p152; add.s64 %rd916, %rd1442, 8; selp.b64 %rd917, %rd916, %rd1445, %p152; mov.u64 %rd1472, 0; setp.eq.s64 %p153, %rd1453, 0; add.s64 %rd918, %rd913, 4; add.s64 %rd919, %rd915, 4; add.s64 %rd920, %rd917, 4; selp.b64 %rd289, %rd913, %rd918, %p153; selp.b64 %rd1444, %rd915, %rd919, %p153; selp.b64 %rd1445, %rd917, %rd920, %p153; selp.b64 %rd1440, %rd912, %rd1440, %p152; selp.b64 %rd1441, %rd914, %rd1441, %p152; selp.b64 %rd1442, %rd916, %rd1442, %p152; add.s64 %rd921, %rd1443, 8; selp.b64 %rd1439, %rd921, %rd1439, %p152; add.s64 %rd922, %rd1449, 8; setp.eq.s64 %p154, %rd1446, %rd1452; selp.b64 %rd923, %rd922, %rd1446, %p154; add.s64 %rd924, %rd1450, 8; selp.b64 %rd925, %rd924, %rd1447, %p154; add.s64 %rd926, %rd1451, 8; selp.b64 %rd927, %rd926, %rd1448, %p154; selp.b64 %rd1449, %rd922, %rd1449, %p154; selp.b64 %rd1450, %rd924, %rd1450, %p154; selp.b64 %rd1451, %rd926, %rd1451, %p154; add.s64 
%rd928, %rd1446, 8; selp.b64 %rd1452, %rd928, %rd1452, %p154; add.s64 %rd929, %rd923, 4; add.s64 %rd930, %rd925, 4; add.s64 %rd931, %rd927, 4; selp.b64 %rd1446, %rd923, %rd929, %p153; selp.b64 %rd1447, %rd925, %rd930, %p153; selp.b64 %rd1448, %rd927, %rd931, %p153; ld.local.f32 %f456, [%rd925]; ld.local.f32 %f457, [%rd915]; setp.eq.f32 %p155, %f457, %f456; mov.u64 %rd1443, %rd289; @%p155 bra $L__BB1_112; $L__BB1_114: mov.u64 %rd1270, 0; or.b64 %rd1471, %rd1270, %rd272; mov.u32 %r608, 2; bra.uni $L__BB1_119; $L__BB1_107: cvt.u32.u64 %r565, %rd222; mov.b32 %f660, %r565; add.u64 %rd1430, %SPL, 0; add.u64 %rd1426, %SP, 728; add.u64 %rd1432, %SP, 0; mul.f32 %f448, %f75, %f67; fma.rn.f32 %f449, %f74, %f66, %f448; div.rn.f32 %f710, %f449, %f79; fma.rn.f32 %f450, %f74, %f710, %f660; mov.b32 %r365, %f450; fma.rn.f32 %f451, %f75, %f710, %f707; mov.b32 %r366, %f451; cvt.u64.u32 %rd879, %r366; cvt.u64.u32 %rd880, %r365; bfi.b64 %rd231, %rd879, %rd880, 32, 32; st.local.u64 [%rd1430], %rd231; mov.u64 %rd1437, 2; mov.u64 %rd1423, %rd32; mov.u64 %rd1424, %rd17; mov.u64 %rd1425, %rd17; mov.u64 %rd1427, %rd17; mov.u64 %rd1428, %rd17; mov.u64 %rd1429, %rd1426; mov.u64 %rd1431, %rd1430; mov.u64 %rd1433, %rd1430; mov.u64 %rd1434, %rd1430; mov.u64 %rd1435, %rd1432; mov.u64 %rd1436, %rd31; $L__BB1_108: setp.eq.s64 %p146, %rd1437, 0; mov.u64 %rd1472, 1; @%p146 bra $L__BB1_110; add.s64 %rd1437, %rd1437, -1; add.s64 %rd885, %rd1424, 8; setp.eq.s64 %p147, %rd1427, %rd1423; selp.b64 %rd886, %rd885, %rd1427, %p147; add.s64 %rd887, %rd1425, 8; selp.b64 %rd888, %rd887, %rd1428, %p147; add.s64 %rd889, %rd1426, 8; selp.b64 %rd890, %rd889, %rd1429, %p147; mov.u64 %rd1472, 0; setp.eq.s64 %p148, %rd1437, 0; add.s64 %rd891, %rd886, 4; add.s64 %rd892, %rd888, 4; add.s64 %rd893, %rd890, 4; selp.b64 %rd248, %rd886, %rd891, %p148; selp.b64 %rd1428, %rd888, %rd892, %p148; selp.b64 %rd1429, %rd890, %rd893, %p148; selp.b64 %rd1424, %rd885, %rd1424, %p147; selp.b64 %rd1425, %rd887, %rd1425, %p147; selp.b64 
%rd1426, %rd889, %rd1426, %p147; add.s64 %rd894, %rd1427, 8; selp.b64 %rd1423, %rd894, %rd1423, %p147; add.s64 %rd895, %rd1433, 8; setp.eq.s64 %p149, %rd1430, %rd1436; selp.b64 %rd896, %rd895, %rd1430, %p149; add.s64 %rd897, %rd1434, 8; selp.b64 %rd898, %rd897, %rd1431, %p149; add.s64 %rd899, %rd1435, 8; selp.b64 %rd900, %rd899, %rd1432, %p149; selp.b64 %rd1433, %rd895, %rd1433, %p149; selp.b64 %rd1434, %rd897, %rd1434, %p149; selp.b64 %rd1435, %rd899, %rd1435, %p149; add.s64 %rd901, %rd1430, 8; selp.b64 %rd1436, %rd901, %rd1436, %p149; add.s64 %rd902, %rd896, 4; add.s64 %rd903, %rd898, 4; add.s64 %rd904, %rd900, 4; selp.b64 %rd1430, %rd896, %rd902, %p148; selp.b64 %rd1431, %rd898, %rd903, %p148; selp.b64 %rd1432, %rd900, %rd904, %p148; ld.local.f32 %f452, [%rd898]; ld.local.f32 %f453, [%rd888]; setp.eq.f32 %p150, %f453, %f452; mov.u64 %rd1427, %rd248; @%p150 bra $L__BB1_108; $L__BB1_110: mov.u64 %rd1269, 0; or.b64 %rd1471, %rd1269, %rd231; mov.u32 %r608, 1; bra.uni $L__BB1_119; $L__BB1_115: cvt.u32.u64 %r544, %rd220; mov.b32 %f652, %r544; add.u64 %rd1304, %SP, 0; add.u64 %rd1458, %SP, 728; add.u64 %rd1464, %SP, 0; cvta.to.local.u64 %rd1462, %rd1464; div.rn.f32 %f710, %f64, %f77; fma.rn.f32 %f458, %f55, %f710, %f652; mov.b32 %r371, %f458; fma.rn.f32 %f459, %f57, %f710, %f53; mov.b32 %r372, %f459; cvt.u64.u32 %rd933, %r372; cvt.u64.u32 %rd934, %r371; bfi.b64 %rd313, %rd933, %rd934, 32, 32; st.local.u64 [%rd1462], %rd313; mov.u64 %rd1469, 2; mov.u64 %rd1455, %rd32; mov.u64 %rd1456, %rd17; mov.u64 %rd1457, %rd17; mov.u64 %rd1459, %rd17; mov.u64 %rd1460, %rd17; mov.u64 %rd1461, %rd1458; mov.u64 %rd1463, %rd1462; mov.u64 %rd1465, %rd1462; mov.u64 %rd1466, %rd1462; mov.u64 %rd1467, %rd1464; mov.u64 %rd1468, %rd29; $L__BB1_116: setp.eq.s64 %p156, %rd1469, 0; mov.u64 %rd1472, 1; @%p156 bra $L__BB1_118; add.s64 %rd1469, %rd1469, -1; add.s64 %rd939, %rd1456, 8; setp.eq.s64 %p157, %rd1459, %rd1455; selp.b64 %rd940, %rd939, %rd1459, %p157; add.s64 %rd941, %rd1457, 8; selp.b64 
%rd942, %rd941, %rd1460, %p157; add.s64 %rd943, %rd1458, 8; selp.b64 %rd944, %rd943, %rd1461, %p157; mov.u64 %rd1472, 0; setp.eq.s64 %p158, %rd1469, 0; add.s64 %rd945, %rd940, 4; add.s64 %rd946, %rd942, 4; add.s64 %rd947, %rd944, 4; selp.b64 %rd330, %rd940, %rd945, %p158; selp.b64 %rd1460, %rd942, %rd946, %p158; selp.b64 %rd1461, %rd944, %rd947, %p158; selp.b64 %rd1456, %rd939, %rd1456, %p157; selp.b64 %rd1457, %rd941, %rd1457, %p157; selp.b64 %rd1458, %rd943, %rd1458, %p157; add.s64 %rd948, %rd1459, 8; selp.b64 %rd1455, %rd948, %rd1455, %p157; add.s64 %rd949, %rd1465, 8; setp.eq.s64 %p159, %rd1462, %rd1468; selp.b64 %rd950, %rd949, %rd1462, %p159; add.s64 %rd951, %rd1466, 8; selp.b64 %rd952, %rd951, %rd1463, %p159; add.s64 %rd953, %rd1467, 8; selp.b64 %rd954, %rd953, %rd1464, %p159; selp.b64 %rd1465, %rd949, %rd1465, %p159; selp.b64 %rd1466, %rd951, %rd1466, %p159; selp.b64 %rd1467, %rd953, %rd1467, %p159; add.s64 %rd955, %rd1462, 8; selp.b64 %rd1468, %rd955, %rd1468, %p159; add.s64 %rd956, %rd950, 4; add.s64 %rd957, %rd952, 4; add.s64 %rd958, %rd954, 4; selp.b64 %rd1462, %rd950, %rd956, %p158; selp.b64 %rd1463, %rd952, %rd957, %p158; selp.b64 %rd1464, %rd954, %rd958, %p158; ld.local.f32 %f460, [%rd952]; ld.local.f32 %f461, [%rd942]; setp.eq.f32 %p160, %f461, %f460; mov.u64 %rd1459, %rd330; @%p160 bra $L__BB1_116; $L__BB1_118: mov.u32 %r608, 0; mov.u64 %rd1271, 0; or.b64 %rd1471, %rd1271, %rd313; $L__BB1_119: mov.f32 %f462, 0f3F800000; sub.f32 %f463, %f462, %f710; mov.b32 %r375, %f463; mov.b32 %r376, %f710; cvt.u64.u32 %rd959, %r376; cvt.u64.u32 %rd960, %r375; bfi.b64 %rd1521, %rd959, %rd960, 32, 32; mov.b64 {%r377, %r378}, %rd1472; mov.b64 {%r379, %r380}, %rd1471; cvt.u32.u64 %r606, %rd1471; mov.b32 %f711, %r380; mov.u32 %r607, 1; mov.b32 {%rs118, %rs55}, %r377; bra.uni $L__BB1_132; $L__BB1_100: setp.lt.f32 %p144, %f84, %f85; @%p144 bra $L__BB1_102; bra.uni $L__BB1_101; $L__BB1_102: cvt.u32.u64 %r562, %rd220; mov.b32 %f657, %r562; mul.f32 %f708, %f61, %f81; 
fma.rn.f32 %f706, %f59, %f81, %f657; mov.u32 %r608, 2; mov.f32 %f707, %f53; mov.f32 %f709, %f81; bra.uni $L__BB1_106; $L__BB1_104: cvt.u32.u64 %r563, %rd222; mov.b32 %f658, %r563; mul.f32 %f708, %f75, %f709; fma.rn.f32 %f706, %f74, %f709, %f658; mov.u32 %r608, 1; bra.uni $L__BB1_106; $L__BB1_101: cvt.u32.u64 %r561, %rd222; mov.b32 %f656, %r561; mul.f32 %f708, %f75, %f709; fma.rn.f32 %f706, %f74, %f709, %f656; mov.u32 %r608, 1; $L__BB1_106: add.f32 %f711, %f707, %f708; mov.f32 %f446, 0f3F800000; sub.f32 %f447, %f446, %f709; mov.b32 %r363, %f447; mov.b32 %r364, %f709; cvt.u64.u32 %rd876, %r364; cvt.u64.u32 %rd877, %r363; bfi.b64 %rd1521, %rd876, %rd877, 32, 32; mov.b32 %r606, %f706; mov.u32 %r607, 1; mov.u16 %rs118, 1; $L__BB1_132: mov.b32 %f470, %r606; sub.f32 %f471, %f470, %f49; sub.f32 %f472, %f711, %f50; mul.f32 %f473, %f472, %f472; fma.rn.f32 %f474, %f471, %f471, %f473; add.f32 %f475, %f474, 0f00000000; sqrt.rn.f32 %f476, %f475; shl.b64 %rd1042, %rd212, 2; add.s64 %rd1043, %rd14, %rd1042; st.local.f32 [%rd1043+-4], %f476; mul.lo.s64 %rd1044, %rd212, 36; add.s64 %rd1045, %rd13, %rd1044; st.local.u32 [%rd1045+-36], %r606; st.local.f32 [%rd1045+-32], %f711; mov.u16 %rs68, 0; st.local.v4.u8 [%rd1045+-28], {%rs118, %rs68, %rs68, %rs68}; st.local.u32 [%rd1045+-24], %r46; st.local.u32 [%rd1045+-20], %r607; st.local.u32 [%rd1045+-16], %r608; shr.u64 %rd1046, %rd1521, 32; st.local.u32 [%rd1045+-8], %rd1046; st.local.u32 [%rd1045+-12], %rd1521; $L__BB1_133: setp.lt.u64 %p176, %rd212, 4; add.s64 %rd212, %rd212, 1; @%p176 bra $L__BB1_80; ld.local.v2.u64 {%rd1522, %rd1523}, [%rd14]; ld.local.v4.u32 {%r618, %r619, %r620, %r408}, [%rd13]; ld.local.u32 %r621, [%rd13+16]; ld.local.u32 %rd1049, [%rd13+36]; ld.local.u32 %rd1050, [%rd13+40]; bfi.b64 %rd1051, %rd1050, %rd1049, 32, 32; mov.b64 {%r615, %r616}, %rd1051; ld.local.u32 %r617, [%rd13+44]; ld.local.u32 %r622, [%rd13+52]; ld.local.u32 %r614, [%rd13+80]; ld.local.u64 %rd1052, [%rd13+72]; mov.b64 {%r612, %r613}, %rd1052; 
ld.local.u32 %r623, [%rd13+88]; ld.local.u32 %rd1053, [%rd13+108]; ld.local.u32 %rd1054, [%rd13+112]; bfi.b64 %rd1055, %rd1054, %rd1053, 32, 32; mov.b64 {%r609, %r610}, %rd1055; ld.local.u32 %r611, [%rd13+116]; ld.local.u32 %r624, [%rd13+124]; bra.uni $L__BB1_135; $L__BB1_78: mov.u32 %r621, 4; mov.u32 %r622, %r621; mov.u32 %r623, %r621; mov.u32 %r624, %r621; $L__BB1_135: and.b64 %rd1056, %rd207, 1; setp.eq.b64 %p177, %rd1056, 1; mov.pred %p178, 0; xor.pred %p179, %p177, %p178; not.pred %p180, %p179; mov.b64 {%r89, %r90}, %rd1522; mov.b64 {%r91, %r92}, %rd1523; @%p180 bra $L__BB1_144; bra.uni $L__BB1_136; $L__BB1_144: and.b64 %rd1072, %rd207, 2; setp.eq.s64 %p194, %rd1072, 0; @%p194 bra $L__BB1_153; bra.uni $L__BB1_145; $L__BB1_153: and.b64 %rd1088, %rd207, 4; setp.eq.s64 %p208, %rd1088, 0; @%p208 bra $L__BB1_162; bra.uni $L__BB1_154; $L__BB1_162: and.b64 %rd1104, %rd207, 8; setp.eq.s64 %p222, %rd1104, 0; @%p222 bra $L__BB1_71; mov.pred %p342, 0; ld.u8 %rs75, [%rd196+88]; and.b16 %rs76, %rs75, 1; setp.eq.b16 %p223, %rs76, 1; xor.pred %p225, %p223, %p342; not.pred %p226, %p225; @%p226 bra $L__BB1_166; bra.uni $L__BB1_164; $L__BB1_166: ld.u32 %r140, [%rd196+76]; cvt.u64.u32 %rd1108, %r140; setp.le.u64 %p233, %rd184, %rd1108; @%p233 bra $L__BB1_71; mov.b32 %f689, %r92; neg.f32 %f115, %f689; setp.lt.u32 %p234, %r45, 64; @%p234 bra $L__BB1_169; bra.uni $L__BB1_168; $L__BB1_169: mul.wide.u32 %rd1118, %r45, 8; add.s64 %rd1119, %rd16, %rd1118; mov.u64 %rd1530, 0; st.local.u32 [%rd1119], %r140; st.local.f32 [%rd1119+4], %f115; add.s32 %r45, %r45, 1; st.local.u32 [%rd16+512], %r45; mov.u64 %rd1531, %rd1530; bra.uni $L__BB1_170; $L__BB1_136: mov.pred %p339, 0; ld.u8 %rs69, [%rd196+88]; and.b16 %rs70, %rs69, 1; setp.eq.b16 %p181, %rs70, 1; xor.pred %p183, %p181, %p339; not.pred %p184, %p183; @%p184 bra $L__BB1_139; bra.uni $L__BB1_137; $L__BB1_139: ld.u32 %r98, [%rd196+64]; cvt.u64.u32 %rd1060, %r98; setp.le.u64 %p191, %rd184, %rd1060; @%p191 bra $L__BB1_144; mov.b32 %f683, 
%r89; neg.f32 %f112, %f683; setp.lt.u32 %p192, %r45, 64; @%p192 bra $L__BB1_142; bra.uni $L__BB1_141; $L__BB1_142: add.s32 %r411, %r44, -1; mul.wide.u32 %rd1070, %r411, 8; add.s64 %rd1071, %rd16, %rd1070; mov.u64 %rd1524, 0; st.local.u32 [%rd1071], %r98; st.local.f32 [%rd1071+4], %f112; add.s32 %r45, %r45, 1; st.local.u32 [%rd16+512], %r45; mov.u64 %rd1525, %rd1524; bra.uni $L__BB1_143; $L__BB1_145: mov.pred %p340, 0; ld.u8 %rs71, [%rd196+88]; and.b16 %rs72, %rs71, 1; setp.eq.b16 %p195, %rs72, 1; xor.pred %p197, %p195, %p340; not.pred %p198, %p197; @%p198 bra $L__BB1_148; bra.uni $L__BB1_146; $L__BB1_148: ld.u32 %r112, [%rd196+68]; cvt.u64.u32 %rd1076, %r112; setp.le.u64 %p205, %rd184, %rd1076; @%p205 bra $L__BB1_153; mov.b32 %f685, %r90; neg.f32 %f113, %f685; setp.lt.u32 %p206, %r45, 64; @%p206 bra $L__BB1_151; bra.uni $L__BB1_150; $L__BB1_151: mul.wide.u32 %rd1086, %r45, 8; add.s64 %rd1087, %rd16, %rd1086; mov.u64 %rd1526, 0; st.local.u32 [%rd1087], %r112; st.local.f32 [%rd1087+4], %f113; add.s32 %r45, %r45, 1; st.local.u32 [%rd16+512], %r45; mov.u64 %rd1527, %rd1526; bra.uni $L__BB1_152; $L__BB1_154: mov.pred %p341, 0; ld.u8 %rs73, [%rd196+88]; and.b16 %rs74, %rs73, 1; setp.eq.b16 %p209, %rs74, 1; xor.pred %p211, %p209, %p341; not.pred %p212, %p211; @%p212 bra $L__BB1_157; bra.uni $L__BB1_155; $L__BB1_157: ld.u32 %r126, [%rd196+72]; cvt.u64.u32 %rd1092, %r126; setp.le.u64 %p219, %rd184, %rd1092; @%p219 bra $L__BB1_162; mov.b32 %f687, %r91; neg.f32 %f114, %f687; setp.lt.u32 %p220, %r45, 64; @%p220 bra $L__BB1_160; bra.uni $L__BB1_159; $L__BB1_160: mul.wide.u32 %rd1102, %r45, 8; add.s64 %rd1103, %rd16, %rd1102; mov.u64 %rd1528, 0; st.local.u32 [%rd1103], %r126; st.local.f32 [%rd1103+4], %f114; add.s32 %r45, %r45, 1; st.local.u32 [%rd16+512], %r45; mov.u64 %rd1529, %rd1528; bra.uni $L__BB1_161; $L__BB1_164: mov.b32 %f688, %r92; mov.b32 %f479, %r43; setp.leu.f32 %p227, %f479, %f688; setp.eq.s32 %p228, %r624, 4; or.pred %p229, %p228, %p227; @%p229 bra $L__BB1_71; 
bra.uni $L__BB1_165; $L__BB1_137: mov.b32 %f682, %r89; mov.b32 %f639, %r43; setp.leu.f32 %p185, %f639, %f682; setp.eq.s32 %p186, %r621, 4; or.pred %p187, %p186, %p185; @%p187 bra $L__BB1_144; ld.u32 %r409, [%rd196+64]; cvt.u64.u32 %rd1057, %r409; setp.le.u64 %p188, %rd187, %rd1057; mul.wide.u32 %rd1058, %r409, 12; add.s64 %rd1059, %rd188, %rd1058; setp.eq.s64 %p189, %rd1059, 0; or.pred %p190, %p188, %p189; selp.b32 %r40, %r40, %r620, %p190; selp.b32 %r39, %r39, %r619, %p190; selp.b32 %r38, %r38, %r618, %p190; selp.b32 %r42, %r42, %r621, %p190; selp.b32 %r43, %r43, %r89, %p190; bra.uni $L__BB1_144; $L__BB1_146: mov.b32 %f684, %r90; mov.b32 %f477, %r43; setp.leu.f32 %p199, %f477, %f684; setp.eq.s32 %p200, %r622, 4; or.pred %p201, %p200, %p199; @%p201 bra $L__BB1_153; ld.u32 %r417, [%rd196+68]; cvt.u64.u32 %rd1073, %r417; setp.le.u64 %p202, %rd187, %rd1073; mul.wide.u32 %rd1074, %r417, 12; add.s64 %rd1075, %rd188, %rd1074; setp.eq.s64 %p203, %rd1075, 0; or.pred %p204, %p202, %p203; selp.b32 %r40, %r40, %r617, %p204; selp.b32 %r39, %r39, %r616, %p204; selp.b32 %r38, %r38, %r615, %p204; selp.b32 %r42, %r42, %r622, %p204; selp.b32 %r43, %r43, %r90, %p204; bra.uni $L__BB1_153; $L__BB1_155: mov.b32 %f686, %r91; mov.b32 %f478, %r43; setp.leu.f32 %p213, %f478, %f686; setp.eq.s32 %p214, %r623, 4; or.pred %p215, %p214, %p213; @%p215 bra $L__BB1_162; ld.u32 %r424, [%rd196+72]; cvt.u64.u32 %rd1089, %r424; setp.le.u64 %p216, %rd187, %rd1089; mul.wide.u32 %rd1090, %r424, 12; add.s64 %rd1091, %rd188, %rd1090; setp.eq.s64 %p217, %rd1091, 0; or.pred %p218, %p216, %p217; selp.b32 %r40, %r40, %r614, %p218; selp.b32 %r39, %r39, %r613, %p218; selp.b32 %r38, %r38, %r612, %p218; selp.b32 %r42, %r42, %r623, %p218; selp.b32 %r43, %r43, %r91, %p218; bra.uni $L__BB1_162; $L__BB1_168: mov.u64 %rd1531, 1; shl.b64 %rd1530, %rd1108, 32; $L__BB1_170: mov.u64 %rd1284, 0; cvt.u32.u64 %r433, %rd1284; cvt.u32.u64 %r434, %rd1530; or.b32 %r435, %r434, %r433; cvt.u32.u64 %r436, %rd1531; or.b32 %r437, 
%r435, %r436; setp.eq.s32 %p235, %r437, 0; @%p235 bra $L__BB1_71; bra.uni $L__BB1_171; $L__BB1_141: cvt.u64.u32 %rd1358, %r98; mov.u64 %rd1525, 1; shl.b64 %rd1524, %rd1358, 32; $L__BB1_143: mov.u64 %rd1275, 0; cvt.u32.u64 %r412, %rd1275; cvt.u32.u64 %r413, %rd1524; or.b32 %r414, %r413, %r412; cvt.u32.u64 %r415, %rd1525; or.b32 %r416, %r414, %r415; setp.ne.s32 %p193, %r416, 0; @%p193 bra $L__BB1_171; bra.uni $L__BB1_144; $L__BB1_150: mov.u64 %rd1527, 1; shl.b64 %rd1526, %rd1076, 32; $L__BB1_152: mov.u64 %rd1278, 0; cvt.u32.u64 %r419, %rd1278; cvt.u32.u64 %r420, %rd1526; or.b32 %r421, %r420, %r419; cvt.u32.u64 %r422, %rd1527; or.b32 %r423, %r421, %r422; setp.ne.s32 %p207, %r423, 0; @%p207 bra $L__BB1_171; bra.uni $L__BB1_153; $L__BB1_159: mov.u64 %rd1529, 1; shl.b64 %rd1528, %rd1092, 32; $L__BB1_161: mov.u64 %rd1281, 0; cvt.u32.u64 %r426, %rd1281; cvt.u32.u64 %r427, %rd1528; or.b32 %r428, %r427, %r426; cvt.u32.u64 %r429, %rd1529; or.b32 %r430, %r428, %r429; setp.ne.s32 %p221, %r430, 0; @%p221 bra $L__BB1_171; bra.uni $L__BB1_162; $L__BB1_172: mov.u64 %rd1532, 0; mov.u64 %rd1533, 2; setp.eq.s32 %p236, %r42, 4; mov.u64 %rd1534, %rd1532; @%p236 bra $L__BB1_174; mov.b64 %rd1534, {%r38, %r39}; mov.b32 {%rs77, %rs78}, %r40; mov.b64 %rd1126, {%r40, %r438}; and.b64 %rd1532, %rd1126, 4294967040; cvt.u64.u16 %rd1127, %rs77; and.b64 %rd1533, %rd1127, 255; $L__BB1_174: mov.u64 %rd1536, 0; mov.u64 %rd1535, 2; or.b64 %rd1134, %rd1533, %rd1532; or.b64 %rd1135, %rd1134, %rd1536; mov.b64 {%r439, %r440}, %rd1135; mov.b32 {%rs17, %rs79}, %r439; and.b16 %rs80, %rs17, 255; setp.eq.s16 %p237, %rs80, 2; @%p237 bra $L__BB1_176; cvt.u32.u64 %r441, %rd1534; mov.b32 %f480, %r441; shr.u64 %rd1136, %rd1534, 32; cvt.u32.u64 %r442, %rd1136; mov.b32 %f481, %r442; ld.global.f32 %f482, [%rd54+248]; mul.f32 %f483, %f482, %f480; ld.global.f32 %f484, [%rd54+252]; mul.f32 %f485, %f484, %f481; sub.f32 %f486, %f483, %f485; mul.f32 %f487, %f484, %f480; fma.rn.f32 %f488, %f482, %f481, %f487; ld.global.f32 
%f489, [%rd54+256]; add.f32 %f490, %f489, %f486; mov.b32 %r443, %f490; ld.global.f32 %f491, [%rd54+260]; add.f32 %f492, %f491, %f488; mov.b32 %r444, %f492; cvt.u64.u32 %rd1137, %r444; cvt.u64.u32 %rd1138, %r443; cvt.u64.u16 %rd1139, %rs17; bfi.b64 %rd1536, %rd1137, %rd1138, 32, 32; and.b64 %rd1140, %rd1139, 255; mov.b64 {%r445, %r446}, %rd1140; mov.b32 {%rs81, %rs82}, %r445; cvt.u64.u16 %rd1535, %rs81; $L__BB1_176: mov.u64 %rd1538, 0; mov.u64 %rd1537, 2; or.b64 %rd1147, %rd1538, %rd1535; or.b64 %rd536, %rd1147, %rd1538; mov.b64 {%r447, %r448}, %rd536; mov.b32 {%rs18, %rs83}, %r447; and.b16 %rs84, %rs18, 255; setp.eq.s16 %p238, %rs84, 2; mov.u64 %rd1539, %rd1538; @%p238 bra $L__BB1_178; mov.u64 %rd1346, 0; and.b64 %rd1149, %rd536, 4294967040; cvt.u64.u16 %rd1150, %rs18; and.b64 %rd1151, %rd1150, 255; or.b64 %rd1152, %rd1151, %rd1346; or.b64 %rd1153, %rd1152, %rd1149; mov.b64 {%r449, %r450}, %rd1153; mov.b32 {%rs85, %rs86}, %r449; not.b16 %rs87, %rs85; ld.global.u8 %rs88, [%rd54+240]; setp.eq.s16 %p239, %rs88, 0; and.b16 %rs89, %rs87, 1; selp.b16 %rs90, %rs85, %rs89, %p239; and.b64 %rd1154, %rd1153, 4294967040; cvt.u64.u16 %rd1155, %rs90; and.b64 %rd1156, %rd1155, 255; or.b64 %rd1157, %rd1154, %rd1346; or.b64 %rd1158, %rd1157, %rd1156; mov.b64 {%r451, %r452}, %rd1158; mov.b32 {%rs91, %rs92}, %r451; and.b64 %rd1539, %rd1158, 4294967040; cvt.u64.u16 %rd1159, %rs91; and.b64 %rd1537, %rd1159, 255; mov.u64 %rd1538, %rd1536; $L__BB1_178: mov.u64 %rd1347, 0; or.b64 %rd1160, %rd1538, %rd1347; or.b64 %rd1161, %rd1347, %rd1537; or.b64 %rd1162, %rd1161, %rd1539; or.b64 %rd1163, %rd1160, %rd1347; mov.b64 {%r655, %r656}, %rd1163; mov.b64 {%r657, %r453}, %rd1162; bra.uni $L__BB1_235; $L__BB1_38: cvt.u32.u64 %r246, %rd64; cvt.u32.u64 %r247, %rd83; rem.u32 %r248, %r247, %r246; cvt.u64.u32 %rd1379, %r248; $L__BB1_39: shl.b64 %rd697, %rd1379, 3; add.s64 %rd87, %rd65, %rd697; ld.u32 %rd698, [%rd87]; ld.u32 %rd699, [%rd87+4]; bfi.b64 %rd88, %rd699, %rd698, 32, 32; add.s64 %rd89, 
%rd1379, 1; or.b64 %rd700, %rd89, %rd64; and.b64 %rd701, %rd700, -4294967296; setp.eq.s64 %p56, %rd701, 0; @%p56 bra $L__BB1_41; rem.u64 %rd1380, %rd89, %rd64; bra.uni $L__BB1_42; $L__BB1_51: cvt.u32.u64 %r256, %rd64; cvt.u32.u64 %r257, %rd130; rem.u32 %r258, %r257, %r256; cvt.u64.u32 %rd1396, %r258; $L__BB1_52: add.s64 %rd1410, %rd3, 16; add.u64 %rd1323, %SP, 16; or.b64 %rd1406, %rd1323, 8; add.s64 %rd1404, %rd3, 8; add.u64 %rd1320, %SP, 208; add.s64 %rd1400, %rd1320, 36; add.s64 %rd1398, %rd4, 36; shl.b64 %rd743, %rd1396, 3; add.s64 %rd744, %rd65, %rd743; ld.u32 %rd745, [%rd744]; ld.u32 %rd746, [%rd744+4]; bfi.b64 %rd141, %rd746, %rd745, 32, 32; st.local.v2.u64 [%rd3], {%rd131, %rd141}; mov.u64 %rd1411, 2; mov.u64 %rd1397, %rd37; mov.u64 %rd1399, %rd1398; mov.u64 %rd1401, %rd1398; mov.u64 %rd1402, %rd1398; mov.u64 %rd1403, %rd1400; mov.u64 %rd1405, %rd1404; mov.u64 %rd1407, %rd1404; mov.u64 %rd1408, %rd1404; mov.u64 %rd1409, %rd1406; $L__BB1_53: setp.eq.s64 %p65, %rd1411, 0; @%p65 bra $L__BB1_56; add.s64 %rd1411, %rd1411, -1; add.s64 %rd747, %rd1398, 8; setp.eq.s64 %p66, %rd1401, %rd1397; selp.b64 %rd748, %rd747, %rd1401, %p66; add.s64 %rd749, %rd1399, 8; selp.b64 %rd750, %rd749, %rd1402, %p66; add.s64 %rd751, %rd1400, 8; selp.b64 %rd752, %rd751, %rd1403, %p66; setp.eq.s64 %p67, %rd1411, 0; add.s64 %rd753, %rd748, 4; add.s64 %rd754, %rd750, 4; add.s64 %rd755, %rd752, 4; selp.b64 %rd158, %rd748, %rd753, %p67; selp.b64 %rd1402, %rd750, %rd754, %p67; selp.b64 %rd1403, %rd752, %rd755, %p67; selp.b64 %rd1398, %rd747, %rd1398, %p66; selp.b64 %rd1399, %rd749, %rd1399, %p66; selp.b64 %rd1400, %rd751, %rd1400, %p66; add.s64 %rd756, %rd1401, 8; selp.b64 %rd1397, %rd756, %rd1397, %p66; add.s64 %rd757, %rd1407, 8; setp.eq.s64 %p68, %rd1404, %rd1410; selp.b64 %rd758, %rd757, %rd1404, %p68; add.s64 %rd759, %rd1408, 8; selp.b64 %rd760, %rd759, %rd1405, %p68; add.s64 %rd761, %rd1409, 8; selp.b64 %rd762, %rd761, %rd1406, %p68; selp.b64 %rd1407, %rd757, %rd1407, %p68; selp.b64 
%rd1408, %rd759, %rd1408, %p68; selp.b64 %rd1409, %rd761, %rd1409, %p68; add.s64 %rd763, %rd1404, 8; selp.b64 %rd1410, %rd763, %rd1410, %p68; add.s64 %rd764, %rd758, 4; add.s64 %rd765, %rd760, 4; add.s64 %rd766, %rd762, 4; selp.b64 %rd1404, %rd758, %rd764, %p67; selp.b64 %rd1405, %rd760, %rd765, %p67; selp.b64 %rd1406, %rd762, %rd766, %p67; ld.local.f32 %f252, [%rd760]; ld.local.f32 %f253, [%rd750]; setp.eq.f32 %p69, %f253, %f252; mov.u64 %rd1401, %rd158; @%p69 bra $L__BB1_53; bra.uni $L__BB1_55; $L__BB1_56: cvt.u32.u64 %r259, %rd131; mov.b32 %f254, %r259; shr.u64 %rd767, %rd131, 32; cvt.u32.u64 %r260, %rd767; mov.b32 %f255, %r260; shr.u64 %rd768, %rd141, 32; cvt.u32.u64 %r261, %rd768; cvt.u32.u64 %r262, %rd141; mov.b32 %f256, %r262; sub.f32 %f257, %f256, %f254; mov.b32 %f258, %r261; sub.f32 %f259, %f258, %f255; neg.f32 %f704, %f257; neg.f32 %f705, %f259; bra.uni $L__BB1_57; $L__BB1_41: cvt.u32.u64 %r249, %rd64; cvt.u32.u64 %r250, %rd89; rem.u32 %r251, %r250, %r249; cvt.u64.u32 %rd1380, %r251; $L__BB1_42: add.u64 %rd1317, %SP, 208; add.s64 %rd1384, %rd1317, 44; add.s64 %rd1381, %rd4, 52; add.u64 %rd1388, %SPL, 16; add.u64 %rd1390, %SP, 16; shl.b64 %rd703, %rd1380, 3; add.s64 %rd99, %rd65, %rd703; ld.u32 %rd704, [%rd99]; ld.u32 %rd705, [%rd99+4]; bfi.b64 %rd706, %rd705, %rd704, 32, 32; st.local.v2.u64 [%rd1388], {%rd88, %rd706}; mov.u64 %rd1395, 2; mov.u64 %rd1382, %rd37; mov.u64 %rd1383, %rd37; mov.u64 %rd1385, %rd37; mov.u64 %rd1386, %rd37; mov.u64 %rd1387, %rd1384; mov.u64 %rd1389, %rd1388; mov.u64 %rd1391, %rd1388; mov.u64 %rd1392, %rd1388; mov.u64 %rd1393, %rd1390; mov.u64 %rd1394, %rd39; $L__BB1_43: setp.eq.s64 %p57, %rd1395, 0; @%p57 bra $L__BB1_46; add.s64 %rd1395, %rd1395, -1; add.s64 %rd707, %rd1382, 8; setp.eq.s64 %p58, %rd1385, %rd1381; selp.b64 %rd708, %rd707, %rd1385, %p58; add.s64 %rd709, %rd1383, 8; selp.b64 %rd710, %rd709, %rd1386, %p58; add.s64 %rd711, %rd1384, 8; selp.b64 %rd712, %rd711, %rd1387, %p58; setp.eq.s64 %p59, %rd1395, 0; add.s64 %rd713, 
%rd708, 4; add.s64 %rd714, %rd710, 4; add.s64 %rd715, %rd712, 4; selp.b64 %rd116, %rd708, %rd713, %p59; selp.b64 %rd1386, %rd710, %rd714, %p59; selp.b64 %rd1387, %rd712, %rd715, %p59; selp.b64 %rd1382, %rd707, %rd1382, %p58; selp.b64 %rd1383, %rd709, %rd1383, %p58; selp.b64 %rd1384, %rd711, %rd1384, %p58; add.s64 %rd716, %rd1385, 8; selp.b64 %rd1381, %rd716, %rd1381, %p58; add.s64 %rd717, %rd1391, 8; setp.eq.s64 %p60, %rd1388, %rd1394; selp.b64 %rd718, %rd717, %rd1388, %p60; add.s64 %rd719, %rd1392, 8; selp.b64 %rd720, %rd719, %rd1389, %p60; add.s64 %rd721, %rd1393, 8; selp.b64 %rd722, %rd721, %rd1390, %p60; selp.b64 %rd1391, %rd717, %rd1391, %p60; selp.b64 %rd1392, %rd719, %rd1392, %p60; selp.b64 %rd1393, %rd721, %rd1393, %p60; add.s64 %rd723, %rd1388, 8; selp.b64 %rd1394, %rd723, %rd1394, %p60; add.s64 %rd724, %rd718, 4; add.s64 %rd725, %rd720, 4; add.s64 %rd726, %rd722, 4; selp.b64 %rd1388, %rd718, %rd724, %p59; selp.b64 %rd1389, %rd720, %rd725, %p59; selp.b64 %rd1390, %rd722, %rd726, %p59; ld.local.f32 %f246, [%rd720]; ld.local.f32 %f247, [%rd710]; setp.eq.f32 %p61, %f247, %f246; mov.u64 %rd1385, %rd116; @%p61 bra $L__BB1_43; bra.uni $L__BB1_45; $L__BB1_46: ld.u32 %rd727, [%rd87]; ld.u32 %rd728, [%rd87+4]; bfi.b64 %rd729, %rd728, %rd727, 32, 32; cvt.u32.u64 %r252, %rd729; mov.b32 %f248, %r252; shr.u64 %rd730, %rd729, 32; cvt.u32.u64 %r253, %rd730; mov.b32 %f249, %r253; ld.u32 %rd731, [%rd99]; ld.u32 %rd732, [%rd99+4]; bfi.b64 %rd733, %rd732, %rd731, 32, 32; cvt.u32.u64 %r254, %rd733; shr.u64 %rd734, %rd733, 32; cvt.u32.u64 %r255, %rd734; mov.b32 %f250, %r254; sub.f32 %f704, %f250, %f248; mov.b32 %f251, %r255; sub.f32 %f705, %f251, %f249; $L__BB1_57: mul.f32 %f260, %f41, %f705; fma.rn.f32 %f48, %f40, %f704, %f260; mul.f32 %f261, %f705, %f705; fma.rn.f32 %f262, %f704, %f704, %f261; add.f32 %f263, %f262, 0f00000000; sqrt.rn.f32 %f264, %f263; mul.f32 %f265, %f264, 0f3A83126F; abs.f32 %f266, %f48; setp.gt.f32 %p70, %f266, %f265; @%p70 bra $L__BB1_59; bra.uni 
$L__BB1_58; $L__BB1_59: setp.ge.f32 %p344, %f48, 0f00000000; bra.uni $L__BB1_62; $L__BB1_58: ld.local.u64 %rd769, [%rd4+8]; cvt.u32.u64 %r263, %rd769; mov.b32 %f267, %r263; shr.u64 %rd770, %rd769, 32; cvt.u32.u64 %r264, %rd770; mov.b32 %f268, %r264; sub.f32 %f269, %f2, %f267; sub.f32 %f270, %f3, %f268; mul.f32 %f271, %f41, %f270; fma.rn.f32 %f272, %f40, %f269, %f271; setp.le.f32 %p344, %f272, 0f00000000; $L__BB1_62: selp.u16 %rs28, 1, 0, %p344; st.local.u8 [%rd4+16], %rs28; $L__BB1_63: ld.local.v2.u32 {%r593, %r594}, [%rd4+8]; ld.local.u32 %r595, [%rd4+16]; $L__BB1_65: setp.eq.s32 %p71, %r23, 2; mov.u64 %rd778, 0; mov.u64 %rd1412, 2; mov.u64 %rd1413, %rd778; @%p71 bra $L__BB1_67; setp.ne.s16 %p72, %rs2, 0; cvt.u16.u32 %rs30, %r595; selp.u16 %rs31, 1, 0, %p72; xor.b16 %rs32, %rs30, %rs31; mov.b32 %f279, %r593; mov.b32 %f280, %r594; mul.f32 %f281, %f11, %f279; ld.global.f32 %f282, [%rd54+252]; mul.f32 %f283, %f282, %f280; sub.f32 %f284, %f281, %f283; mul.f32 %f285, %f282, %f279; fma.rn.f32 %f286, %f11, %f280, %f285; add.f32 %f287, %f9, %f284; mov.b32 %r269, %f287; add.f32 %f288, %f10, %f286; mov.b32 %r270, %f288; cvt.u64.u32 %rd779, %r270; cvt.u64.u32 %rd780, %r269; cvt.u64.u16 %rd781, %rs32; bfi.b64 %rd1413, %rd779, %rd780, 32, 32; and.b64 %rd782, %rd781, 255; mov.b64 {%r271, %r272}, %rd782; mov.b32 {%rs33, %rs34}, %r271; cvt.u64.u16 %rd1412, %rs33; $L__BB1_67: or.b64 %rd783, %rd778, %rd778; or.b64 %rd784, %rd1412, %rd778; or.b64 %rd785, %rd784, %rd778; or.b64 %rd786, %rd783, %rd1413; mov.b64 {%r655, %r656}, %rd786; mov.b64 {%r657, %r273}, %rd785; $L__BB1_235: add.s64 %rd1368, %rd57, 1; add.s64 %rd1365, %rd54, 280; mov.b32 {%rs23, %rs102}, %r657; and.b16 %rs103, %rs23, 255; setp.eq.s16 %p317, %rs103, 2; @%p317 bra $L__BB1_5; add.s64 %rd1368, %rd57, 1; add.s64 %rd1365, %rd54, 280; mov.b64 %rd1235, {%r655, %r656}; mov.b32 %f593, %r655; shr.u64 %rd1236, %rd1235, 32; cvt.u32.u64 %r521, %rd1236; mov.b32 %f594, %r521; sub.f32 %f175, %f2, %f593; sub.f32 %f176, %f3, %f594; 
mul.f32 %f595, %f176, %f176; fma.rn.f32 %f596, %f175, %f175, %f595; add.f32 %f597, %f596, 0f00000000; sqrt.rn.f32 %f699, %f597; setp.geu.f32 %p318, %f699, %f8; @%p318 bra $L__BB1_5; bra.uni $L__BB1_237; $L__BB1_238: and.b16 %rs115, %rs1, 255; setp.eq.s16 %p338, %rs115, 2; @%p338 bra $L__BB1_255; bra.uni $L__BB1_239; $L__BB1_255: mov.b32 %r659, %f4; mov.b32 %r658, %f5; mov.u64 %rd1263, 3; st.global.u64 [%rd23+20], %rd1263; bra.uni $L__BB1_256; $L__BB1_239: and.b16 %rs105, %rs1, 1; setp.eq.b16 %p320, %rs105, 1; selp.b64 %rd1553, 1, 2, %p320; st.global.u64 [%rd23+20], %rd1553; st.global.u64 [%rd23+28], %rd48; st.global.u64 [%rd23+36], %rd49; $L__BB1_240: mov.b32 %r659, %f4; mov.b32 %r658, %f5; cvt.u16.u64 %rs106, %rd1553; shl.b16 %rs107, %rs106, 14; add.s16 %rs108, %rs107, -16384; shr.s16 %rs109, %rs108, 14; setp.lt.s16 %p321, %rs109, 0; @%p321 bra $L__BB1_256; ld.param.u64 %rd1326, [grid_update_param_3]; ld.param.u64 %rd1325, [grid_update_param_2]; ld.global.u64 %rd594, [%rd23+28]; setp.ge.u64 %p322, %rd594, %rd1326; mul.lo.s64 %rd1239, %rd594, 280; add.s64 %rd1240, %rd1325, %rd1239; setp.eq.s64 %p323, %rd1240, 0; or.pred %p324, %p322, %p323; @%p324 bra $L__BB1_254; ld.param.u64 %rd1328, [grid_update_param_2]; cvta.to.global.u64 %rd1327, %rd1328; add.s64 %rd595, %rd1327, %rd1239; ld.global.u16 %rs110, [%rd595+272]; setp.eq.s16 %p325, %rs110, 0; @%p325 bra $L__BB1_253; mov.b32 %r659, %f4; mov.b32 %r658, %f5; setp.eq.s16 %p326, %rs110, 3; @%p326 bra $L__BB1_256; ld.global.u64 %rd1248, [%rd23+36]; mov.u64 %rd1247, 0; cvt.u32.u64 %r524, %rd1248; mov.b32 %f178, %r524; shr.u64 %rd1249, %rd1248, 32; cvt.u32.u64 %r525, %rd1249; mov.b32 %f179, %r525; mul.f32 %f598, %f179, %f179; fma.rn.f32 %f599, %f178, %f178, %f598; add.f32 %f180, %f599, 0f00000000; setp.leu.f32 %p327, %f180, 0f2EDBE6FE; mov.u64 %rd1554, %rd1247; mov.u64 %rd1555, %rd1247; mov.u64 %rd1556, %rd1247; @%p327 bra $L__BB1_246; sqrt.rn.f32 %f600, %f180; mov.b32 %r526, %f600; div.rn.f32 %f601, %f178, %f600; 
div.rn.f32 %f602, %f179, %f600; mov.b32 %r527, %f601; mov.b32 %r528, %f602; cvt.u64.u32 %rd1252, %r528; cvt.u64.u32 %rd1253, %r527; cvt.u64.u32 %rd1555, %r526; mov.u64 %rd1556, 1; bfi.b64 %rd1554, %rd1252, %rd1253, 32, 32; $L__BB1_246: mov.b32 %r659, %f4; mov.b32 %r658, %f5; or.b64 %rd1254, %rd1247, %rd1554; or.b64 %rd1255, %rd1555, %rd1247; shr.u64 %rd1256, %rd1254, 32; shl.b64 %rd1257, %rd1255, 32; or.b64 %rd1258, %rd1257, %rd1256; or.b64 %rd607, %rd1258, %rd1247; xor.b64 %rd1260, %rd1556, 1; or.b64 %rd1261, %rd1260, %rd1247; setp.ne.s64 %p328, %rd1261, 0; @%p328 bra $L__BB1_256; mov.b32 %r659, %f4; mov.b32 %r658, %f5; setp.eq.s64 %p329, %rd1553, 1; cvt.u32.u64 %r529, %rd1554; mov.b32 %f603, %r529; shr.u64 %rd1262, %rd1554, 32; cvt.u32.u64 %r530, %rd1262; mov.b32 %f604, %r530; neg.f32 %f605, %f603; neg.f32 %f606, %f604; mov.b32 %r531, %f605; selp.b32 %r177, %r531, %r529, %p329; selp.f32 %f607, %f605, %f603, %p329; selp.f32 %f181, %f606, %f604, %p329; mul.f32 %f608, %f5, %f181; fma.rn.f32 %f182, %f4, %f607, %f608; setp.geu.f32 %p330, %f182, 0f00000000; @%p330 bra $L__BB1_256; ld.param.f32 %f642, [grid_update_param_1]; mov.b64 {%r532, %r533}, %rd607; mov.b32 %f609, %r533; sub.f32 %f183, %f609, %f642; setp.le.f32 %p332, %f183, 0f00000000; or.pred %p333, %p329, %p332; @%p333 bra $L__BB1_251; bra.uni $L__BB1_249; $L__BB1_251: mov.b32 %f619, %r177; mul.f32 %f620, %f182, %f619; sub.f32 %f184, %f4, %f620; mov.b32 %r659, %f184; mul.f32 %f621, %f181, %f182; sub.f32 %f185, %f5, %f621; mov.b32 %r658, %f185; mul.f32 %f622, %f185, %f185; fma.rn.f32 %f623, %f184, %f184, %f622; add.f32 %f624, %f623, 0f00000000; sqrt.rn.f32 %f186, %f624; setp.leu.f32 %p335, %f186, 0f2EDBE6FF; @%p335 bra $L__BB1_256; ld.global.f32 %f625, [%rd595+264]; fma.rn.f32 %f626, %f182, %f625, %f186; mov.f32 %f627, 0f00000000; max.f32 %f628, %f626, %f627; div.rn.f32 %f629, %f184, %f186; mul.f32 %f630, %f629, %f628; mov.b32 %r659, %f630; div.rn.f32 %f631, %f185, %f186; mul.f32 %f632, %f631, %f628; mov.b32 
%r658, %f632; bra.uni $L__BB1_256; $L__BB1_253: mov.b32 %r556, %f4; mov.b32 %r555, %f5; setp.eq.s64 %p336, %rd1553, 1; selp.b32 %r658, 0, %r555, %p336; selp.b32 %r659, 0, %r556, %p336; $L__BB1_256: st.global.u32 [%rd23], %r659; st.global.u32 [%rd23+4], %r658; ld.global.f32 %f633, [%rd23+12]; setp.eq.f32 %p337, %f633, 0f00000000; rcp.rn.f32 %f634, %f633; selp.f32 %f635, 0f00000000, %f634, %p337; ld.global.f32 %f636, [%rd23+8]; mul.f32 %f637, %f636, %f635; st.global.f32 [%rd23+8], %f637; $L__BB1_257: ret; $L__BB1_249: mov.b32 %r659, %f4; mov.b32 %r658, %f5; ld.param.f32 %f643, [grid_update_param_0]; mul.f32 %f610, %f182, %f643; neg.f32 %f611, %f610; setp.geu.f32 %p334, %f183, %f611; @%p334 bra $L__BB1_256; ld.param.f32 %f644, [grid_update_param_0]; div.rn.f32 %f612, %f183, %f644; add.f32 %f613, %f182, %f612; mov.b32 %f614, %r177; mul.f32 %f615, %f613, %f614; mul.f32 %f616, %f181, %f613; sub.f32 %f617, %f4, %f615; mov.b32 %r659, %f617; sub.f32 %f618, %f5, %f616; mov.b32 %r658, %f618; bra.uni $L__BB1_256; $L__BB1_254: trap; $L__BB1_64: trap; $L__BB1_183: trap; $L__BB1_186: trap; $L__BB1_188: trap; $L__BB1_208: trap; $L__BB1_60: trap; $L__BB1_48: trap; $L__BB1_74: trap; $L__BB1_83: trap; $L__BB1_85: trap; $L__BB1_87: trap; $L__BB1_89: trap; $L__BB1_171: { // callseq 0, 0 .reg .b32 temp_param_reg; call.uni _ZN4core6result13unwrap_failed17ha84c04ab95b75c50E, ( ); } // callseq 0 $L__BB1_55: trap; $L__BB1_45: trap; } // .globl reset_hashmap .visible .entry reset_hashmap( .param .align 8 .b8 reset_hashmap_param_0[16] ) { .reg .pred %p<2>; .reg .b32 %r<25>; .reg .b64 %rd<11>; ld.param.u32 %r2, [reset_hashmap_param_0+8]; ld.param.u64 %rd1, [reset_hashmap_param_0]; mov.u32 %r3, %ntid.z; mov.u32 %r4, %ntid.y; mov.u32 %r5, %ntid.x; mov.b64 %rd2, {%r5, %r4}; mov.u32 %r6, %ctaid.z; mov.u32 %r7, %nctaid.y; mov.u32 %r8, %ctaid.y; mad.lo.s32 %r9, %r6, %r7, %r8; mov.u32 %r10, %nctaid.x; mov.u32 %r11, %ctaid.x; mad.lo.s32 %r12, %r9, %r10, %r11; and.b64 %rd3, %rd2, 4294967295; 
cvt.u64.u32 %rd4, %r4; bfi.b64 %rd5, %rd4, %rd3, 32, 32; cvt.u64.u32 %rd6, %r3; mov.b64 {%r13, %r14}, %rd5; mov.b64 {%r15, %r16}, %rd6; mul.lo.s32 %r17, %r13, %r12; mul.lo.s32 %r18, %r17, %r14; mov.u32 %r19, %tid.z; mov.u32 %r20, %tid.y; mad.lo.s32 %r21, %r19, %r4, %r20; mov.u32 %r22, %tid.x; mad.lo.s32 %r23, %r21, %r5, %r22; mad.lo.s32 %r1, %r18, %r15, %r23; setp.ge.u32 %p1, %r1, %r2; @%p1 bra $L__BB2_2; cvta.to.global.u64 %rd7, %rd1; mul.wide.u32 %rd8, %r1, 16; add.s64 %rd9, %rd7, %rd8; mov.u64 %rd10, -1; st.global.u64 [%rd9], %rd10; mov.u32 %r24, 0; st.global.u32 [%rd9+8], %r24; $L__BB2_2: ret; } // .globl add_data_grp .visible .entry add_data_grp( .param .u64 add_data_grp_param_0, .param .u32 add_data_grp_param_1, .param .u64 add_data_grp_param_2 ) { .reg .pred %p<2>; .reg .b32 %r<9>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [add_data_grp_param_0]; ld.param.u32 %r3, [add_data_grp_param_1]; ld.param.u64 %rd2, [add_data_grp_param_2]; mov.u32 %r4, %ntid.x; mov.u32 %r1, %ctaid.x; mov.u32 %r5, %tid.x; mad.lo.s32 %r2, %r4, %r1, %r5; setp.ge.u32 %p1, %r2, %r3; @%p1 bra $L__BB3_2; cvta.to.global.u64 %rd3, %rd1; mul.wide.u32 %rd4, %r2, 4; add.s64 %rd5, %rd3, %rd4; cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.u32 %r6, [%rd5]; ld.global.u32 %r7, [%rd8]; add.s32 %r8, %r6, %r7; st.global.u32 [%rd5], %r8; $L__BB3_2: ret; } // .globl prefix_sum_512 .visible .entry prefix_sum_512( .param .u64 prefix_sum_512_param_0, .param .u32 prefix_sum_512_param_1, .param .u64 prefix_sum_512_param_2 ) { .reg .pred %p<12>; .reg .b32 %r<22>; .reg .b64 %rd<63>; // demoted variable .shared .align 4 .b8 _ZN16sparkl2d_kernels4cuda10prefix_sum14prefix_sum_51212shared_array6SHARED17h1f773b50ba2bbc70E[2048]; ld.param.u64 %rd20, [prefix_sum_512_param_0]; ld.param.u32 %r5, [prefix_sum_512_param_1]; ld.param.u64 %rd21, [prefix_sum_512_param_2]; mov.u32 %r1, %ctaid.x; shl.b32 %r2, %r1, 9; setp.ge.u32 %p1, %r2, %r5; @%p1 bra $L__BB4_17; mov.u32 %r7, %tid.x; 
cvt.u64.u32 %rd22, %r5; cvt.u64.u32 %rd1, %r1; mul.wide.u32 %rd23, %r1, 512; sub.s64 %rd24, %rd22, %rd23; setp.lt.u64 %p2, %rd24, 2; add.s64 %rd25, %rd24, -1; mov.u64 %rd26, -1; clz.b64 %r8, %rd25; shr.u64 %rd27, %rd26, %r8; add.s64 %rd28, %rd27, 1; selp.b64 %rd29, 1, %rd28, %p2; min.u64 %rd2, %rd29, 512; max.u64 %rd3, %rd2, 1; add.s32 %r9, %r2, %r7; cvt.u64.u32 %rd4, %r9; cvt.u64.u32 %rd5, %r7; setp.ge.u32 %p3, %r9, %r5; cvta.to.global.u64 %rd30, %rd20; mul.wide.u32 %rd31, %r9, 4; add.s64 %rd6, %rd30, %rd31; mov.u32 %r21, 0; @%p3 bra $L__BB4_3; ld.global.u32 %r21, [%rd6]; $L__BB4_3: shl.b64 %rd32, %rd5, 2; mov.u64 %rd33, _ZN16sparkl2d_kernels4cuda10prefix_sum14prefix_sum_51212shared_array6SHARED17h1f773b50ba2bbc70E; add.s64 %rd7, %rd33, %rd32; st.shared.u32 [%rd7], %r21; shr.u64 %rd62, %rd3, 1; setp.eq.s64 %p4, %rd62, 0; @%p4 bra $L__BB4_8; shl.b64 %rd9, %rd5, 1; mov.u64 %rd60, 1; or.b64 %rd10, %rd9, 1; mov.u64 %rd59, %rd62; $L__BB4_5: bar.sync 0; setp.le.u64 %p5, %rd59, %rd5; @%p5 bra $L__BB4_7; mul.lo.s64 %rd35, %rd60, %rd10; add.s64 %rd36, %rd35, %rd60; shl.b64 %rd37, %rd36, 2; add.s64 %rd39, %rd33, %rd37; mul.lo.s64 %rd40, %rd60, %rd9; add.s64 %rd41, %rd40, %rd60; shl.b64 %rd42, %rd41, 2; add.s64 %rd43, %rd33, %rd42; ld.shared.u32 %r10, [%rd39+-4]; ld.shared.u32 %r11, [%rd43+-4]; add.s32 %r12, %r10, %r11; st.shared.u32 [%rd39+-4], %r12; $L__BB4_7: shr.u64 %rd59, %rd59, 1; shl.b64 %rd60, %rd60, 1; setp.ne.s64 %p6, %rd59, 0; @%p6 bra $L__BB4_5; $L__BB4_8: setp.ne.s32 %p7, %r7, 0; @%p7 bra $L__BB4_10; shl.b64 %rd44, %rd3, 2; add.s64 %rd46, %rd33, %rd44; cvta.to.global.u64 %rd47, %rd21; shl.b64 %rd48, %rd1, 2; add.s64 %rd49, %rd47, %rd48; ld.shared.u32 %r14, [%rd46+-4]; st.global.u32 [%rd49], %r14; mov.u32 %r15, 0; st.shared.u32 [%rd46+-4], %r15; $L__BB4_10: setp.lt.u64 %p8, %rd2, 2; bar.sync 0; @%p8 bra $L__BB4_15; shl.b64 %rd15, %rd5, 1; mov.u64 %rd61, 1; $L__BB4_12: setp.le.u64 %p9, %rd61, %rd5; @%p9 bra $L__BB4_14; mul.lo.s64 %rd51, %rd62, %rd15; add.s64 
%rd52, %rd51, %rd62; shl.b64 %rd53, %rd52, 2; add.s64 %rd55, %rd33, %rd53; add.s64 %rd56, %rd55, -4; ld.shared.u32 %r16, [%rd55+-4]; shl.b64 %rd57, %rd62, 2; add.s64 %rd58, %rd56, %rd57; ld.shared.u32 %r17, [%rd58]; st.shared.u32 [%rd55+-4], %r17; add.s32 %r18, %r17, %r16; st.shared.u32 [%rd58], %r18; $L__BB4_14: shl.b64 %rd61, %rd61, 1; shr.u64 %rd62, %rd62, 1; setp.lt.u64 %p10, %rd61, %rd3; bar.sync 0; @%p10 bra $L__BB4_12; $L__BB4_15: cvt.u32.u64 %r19, %rd4; setp.ge.u32 %p11, %r19, %r5; @%p11 bra $L__BB4_17; ld.shared.u32 %r20, [%rd7]; st.global.u32 [%rd6], %r20; $L__BB4_17: ret; } // .globl reset_grid .visible .entry reset_grid( .param .align 8 .b8 reset_grid_param_0[72] ) { .reg .pred %p<2>; .reg .f32 %f<2>; .reg .b32 %r<7>; .reg .b64 %rd<17>; ld.param.u64 %rd8, [reset_grid_param_0+64]; ld.param.u64 %rd2, [reset_grid_param_0+8]; mov.u32 %r3, %tid.y; mov.u32 %r4, %tid.x; mov.u32 %r5, %ctaid.x; mul.wide.u32 %rd9, %r5, 16; cvt.u64.u32 %rd10, %r4; add.s64 %rd11, %rd10, %rd9; mul.wide.u32 %rd12, %r3, 4; add.s64 %rd1, %rd11, %rd12; setp.le.u64 %p1, %rd8, %rd1; @%p1 bra $L__BB5_2; mul.lo.s64 %rd13, %rd1, 48; mov.u64 %rd14, 0; cvta.to.global.u64 %rd15, %rd2; add.s64 %rd16, %rd15, %rd13; mov.u32 %r6, 0; st.global.u32 [%rd16+8], %rd14; st.global.u64 [%rd16], %rd14; st.global.u32 [%rd16+12], %r6; st.global.u64 [%rd16+16], %rd14; st.global.u64 [%rd16+24], %rd14; st.global.u64 [%rd16+40], %rd14; $L__BB5_2: ret; } // .globl copy_grid_projection_data .visible .entry copy_grid_projection_data( .param .align 8 .b8 copy_grid_projection_data_param_0[72], .param .align 8 .b8 copy_grid_projection_data_param_1[72] ) { .reg .pred %p<8>; .reg .f32 %f<3>; .reg .b32 %r<11>; .reg .b64 %rd<71>; ld.param.u64 %rd31, [copy_grid_projection_data_param_1+64]; ld.param.u64 %rd26, [copy_grid_projection_data_param_1+16]; ld.param.u64 %rd25, [copy_grid_projection_data_param_1+8]; ld.param.u64 %rd24, [copy_grid_projection_data_param_0+64]; ld.param.u32 %r2, [copy_grid_projection_data_param_0+40]; 
ld.param.u64 %rd21, [copy_grid_projection_data_param_0+32]; ld.param.u64 %rd18, [copy_grid_projection_data_param_0+8]; cvta.to.global.u64 %rd1, %rd21; cvta.to.global.u64 %rd32, %rd26; mov.u32 %r5, %ctaid.x; mul.wide.u32 %rd33, %r5, 24; add.s64 %rd34, %rd32, %rd33; ld.global.u64 %rd2, [%rd34]; shr.u64 %rd35, %rd2, 16; xor.b64 %rd36, %rd35, %rd2; mul.lo.s64 %rd37, %rd36, 2246822507; shr.u64 %rd38, %rd37, 13; xor.b64 %rd39, %rd38, %rd37; mul.lo.s64 %rd40, %rd39, 3266489909; shr.u64 %rd41, %rd40, 16; xor.b64 %rd42, %rd41, %rd40; cvt.u64.u32 %rd43, %r2; add.s64 %rd3, %rd43, -1; and.b64 %rd67, %rd42, %rd3; shl.b64 %rd44, %rd67, 4; add.s64 %rd45, %rd1, %rd44; ld.global.u64 %rd5, [%rd45]; setp.eq.s64 %p1, %rd5, %rd2; @%p1 bra $L__BB6_5; setp.eq.s64 %p2, %rd5, -1; @%p2 bra $L__BB6_10; $L__BB6_3: add.s64 %rd46, %rd67, 1; and.b64 %rd67, %rd46, %rd3; shl.b64 %rd47, %rd67, 4; add.s64 %rd48, %rd1, %rd47; ld.global.u64 %rd8, [%rd48]; setp.eq.s64 %p3, %rd8, %rd2; @%p3 bra $L__BB6_5; setp.eq.s64 %p4, %rd8, -1; @%p4 bra $L__BB6_10; bra.uni $L__BB6_3; $L__BB6_5: shl.b64 %rd51, %rd67, 4; add.s64 %rd52, %rd1, %rd51; mul.wide.u32 %rd53, %r5, 16; mov.u32 %r7, %tid.y; mov.u32 %r8, %tid.x; mov.u64 %rd69, 0; cvt.u64.u32 %rd54, %r8; mul.wide.u32 %rd55, %r7, 4; add.s64 %rd56, %rd55, %rd54; add.s64 %rd10, %rd56, %rd53; ld.global.u32 %r9, [%rd52+8]; mul.wide.u32 %rd57, %r9, 16; add.s64 %rd58, %rd57, %rd56; setp.le.u64 %p5, %rd24, %rd58; cvta.to.global.u64 %rd59, %rd18; mul.lo.s64 %rd60, %rd58, 48; add.s64 %rd11, %rd59, %rd60; add.s64 %rd12, %rd18, %rd60; mov.u64 %rd70, %rd69; @%p5 bra $L__BB6_7; mov.u64 %rd69, %rd11; mov.u64 %rd70, %rd12; $L__BB6_7: setp.le.u64 %p6, %rd31, %rd10; @%p6 bra $L__BB6_10; setp.eq.s64 %p7, %rd70, 0; @%p7 bra $L__BB6_10; cvta.to.global.u64 %rd61, %rd25; ld.global.u32 %r10, [%rd69]; mul.lo.s64 %rd62, %rd10, 48; add.s64 %rd63, %rd61, %rd62; st.global.u32 [%rd63+20], %r10; ld.global.u64 %rd64, [%rd69+24]; ld.global.u64 %rd65, [%rd69+32]; st.global.u64 [%rd63+24], %rd64; 
st.global.u64 [%rd63+32], %rd65; ld.global.u64 %rd66, [%rd69+40]; st.global.u64 [%rd63+40], %rd66; $L__BB6_10: ret; } // .globl touch_particle_blocks .visible .entry touch_particle_blocks( .param .u64 touch_particle_blocks_param_0, .param .u32 touch_particle_blocks_param_1, .param .align 8 .b8 touch_particle_blocks_param_2[72] ) { .local .align 8 .b8 __local_depot7[48]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<12>; .reg .f32 %f<14>; .reg .b32 %r<43>; .reg .b64 %rd<84>; mov.u64 %SPL, __local_depot7; ld.param.u64 %rd14, [touch_particle_blocks_param_0]; ld.param.u32 %r8, [touch_particle_blocks_param_1]; ld.param.u32 %r7, [touch_particle_blocks_param_2+40]; ld.param.u64 %rd18, [touch_particle_blocks_param_2+32]; ld.param.u64 %rd17, [touch_particle_blocks_param_2+24]; ld.param.u64 %rd16, [touch_particle_blocks_param_2+16]; ld.param.f32 %f1, [touch_particle_blocks_param_2]; mov.u32 %r9, %ntid.z; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ntid.x; mov.b64 %rd22, {%r11, %r10}; mov.u32 %r12, %ctaid.z; mov.u32 %r13, %nctaid.y; mov.u32 %r14, %ctaid.y; mad.lo.s32 %r15, %r12, %r13, %r14; mov.u32 %r16, %nctaid.x; mov.u32 %r17, %ctaid.x; mad.lo.s32 %r18, %r15, %r16, %r17; and.b64 %rd23, %rd22, 4294967295; cvt.u64.u32 %rd24, %r10; bfi.b64 %rd25, %rd24, %rd23, 32, 32; cvt.u64.u32 %rd26, %r9; mov.b64 {%r19, %r20}, %rd25; mov.b64 {%r21, %r22}, %rd26; mul.lo.s32 %r23, %r19, %r18; mul.lo.s32 %r24, %r23, %r20; mov.u32 %r25, %tid.z; mov.u32 %r26, %tid.y; mad.lo.s32 %r27, %r25, %r10, %r26; mov.u32 %r28, %tid.x; mad.lo.s32 %r29, %r27, %r11, %r28; mad.lo.s32 %r1, %r24, %r21, %r29; setp.ge.u32 %p1, %r1, %r8; @%p1 bra $L__BB7_11; cvta.to.global.u64 %rd27, %rd14; mul.wide.u32 %rd28, %r1, 8; add.s64 %rd29, %rd27, %rd28; ld.global.u32 %rd30, [%rd29]; ld.global.u32 %rd31, [%rd29+4]; bfi.b64 %rd32, %rd31, %rd30, 32, 32; mov.u64 %rd82, 0; cvt.u32.u64 %r30, %rd32; mov.b32 %f2, %r30; div.rn.f32 %f3, %f2, %f1; shr.u64 %rd34, %rd32, 32; cvt.u32.u64 %r31, %rd34; mov.b32 %f4, %r31; div.rn.f32 %f5, %f4, 
%f1; mov.b32 %r32, %f3; and.b32 %r33, %r32, -2147483648; or.b32 %r34, %r33, 1056964608; mov.b32 %f6, %r34; add.rz.f32 %f7, %f3, %f6; cvt.rzi.f32.f32 %f8, %f7; setp.gt.f32 %p2, %f8, 0f5EFFFFFF; max.f32 %f9, %f8, 0fDF000000; cvt.rzi.s64.f32 %rd35, %f9; setp.num.f32 %p3, %f8, %f8; mov.b32 %r35, %f5; and.b32 %r36, %r35, -2147483648; or.b32 %r37, %r36, 1056964608; mov.b32 %f10, %r37; add.rz.f32 %f11, %f5, %f10; cvt.rzi.f32.f32 %f12, %f11; setp.leu.f32 %p4, %f12, 0f5EFFFFFF; max.f32 %f13, %f12, 0fDF000000; cvt.rzi.s64.f32 %rd36, %f13; setp.num.f32 %p5, %f12, %f12; add.s64 %rd37, %rd35, 8589934590; shr.u64 %rd38, %rd37, 2; selp.b64 %rd39, 2305843011361177599, %rd38, %p2; selp.b64 %rd40, %rd39, 2147483647, %p3; shl.b64 %rd41, %rd36, 30; and.b64 %rd42, %rd40, 4294967295; add.s64 %rd43, %rd41, 9223372034707292160; and.b64 %rd44, %rd43, -4294967296; and.pred %p6, %p5, %p4; selp.b64 %rd45, %rd44, 9223372032559808512, %p6; or.b64 %rd46, %rd45, %rd42; add.s64 %rd47, %rd45, 4294967296; or.b64 %rd48, %rd47, %rd42; add.s64 %rd49, %rd40, 1; and.b64 %rd50, %rd49, 4294967295; or.b64 %rd51, %rd50, %rd45; or.b64 %rd52, %rd47, %rd50; add.u64 %rd1, %SPL, 0; st.local.u64 [%rd1], %rd46; st.local.u64 [%rd1+8], %rd48; st.local.u64 [%rd1+16], %rd51; st.local.u64 [%rd1+24], %rd52; st.local.u64 [%rd1+32], %rd82; mov.u64 %rd54, 4; st.local.u64 [%rd1+40], %rd54; add.s32 %r3, %r7, -1; setp.eq.s32 %p7, %r3, 0; @%p7 bra $L__BB7_9; cvt.u64.u32 %rd56, %r7; add.s64 %rd4, %rd56, -1; cvta.to.global.u64 %rd5, %rd16; mov.u32 %r38, 1; $L__BB7_3: shl.b64 %rd59, %rd82, 3; add.s64 %rd60, %rd1, %rd59; add.s64 %rd82, %rd82, 1; st.local.u64 [%rd1+32], %rd82; ld.local.u64 %rd8, [%rd60]; shr.u64 %rd61, %rd8, 16; xor.b64 %rd62, %rd61, %rd8; mul.lo.s64 %rd63, %rd62, 2246822507; shr.u64 %rd64, %rd63, 13; xor.b64 %rd65, %rd64, %rd63; mul.lo.s64 %rd66, %rd65, 3266489909; shr.u64 %rd67, %rd66, 16; xor.b64 %rd83, %rd67, %rd66; mov.u32 %r42, %r38; $L__BB7_4: and.b64 %rd11, %rd83, %rd4; shl.b64 %rd73, %rd11, 4; add.s64 
%rd70, %rd18, %rd73; mov.u64 %rd71, -1; // begin inline asm cvta.to.global.u64 %rd68, %rd70;atom.global.cas.b64 %rd69, [%rd68], %rd71, %rd8; // end inline asm setp.eq.s64 %p8, %rd69, -1; @%p8 bra $L__BB7_7; setp.eq.s64 %p9, %rd69, %rd8; @%p9 bra $L__BB7_8; add.s64 %rd83, %rd11, 1; add.s32 %r5, %r42, 1; setp.lt.u32 %p10, %r42, %r3; mov.u32 %r42, %r5; @%p10 bra $L__BB7_4; bra.uni $L__BB7_8; $L__BB7_7: cvta.to.global.u64 %rd76, %rd18; mov.u32 %r40, 1; // begin inline asm cvta.to.global.u64 %rd74, %rd17;atom.global.add.u32 %r39, [%rd74], %r40; // end inline asm mul.wide.u32 %rd77, %r39, 24; add.s64 %rd78, %rd5, %rd77; st.global.u64 [%rd78], %rd8; mov.u32 %r41, 0; st.global.v2.u32 [%rd78+8], {%r41, %r41}; st.global.u32 [%rd78+16], %r41; add.s64 %rd80, %rd76, %rd73; st.global.u32 [%rd80+8], %r39; $L__BB7_8: setp.lt.u64 %p11, %rd82, 4; @%p11 bra $L__BB7_3; bra.uni $L__BB7_11; $L__BB7_9: st.local.u64 [%rd1+32], %rd54; $L__BB7_11: ret; } // .globl tag_halo_blocks .visible .entry tag_halo_blocks( .param .align 8 .b8 tag_halo_blocks_param_0[72], .param .u64 tag_halo_blocks_param_1, .param .u32 tag_halo_blocks_param_2, .param .u64 tag_halo_blocks_param_3 ) { .reg .pred %p<7>; .reg .f32 %f<2>; .reg .b32 %r<31>; .reg .b64 %rd<51>; ld.param.u64 %rd17, [tag_halo_blocks_param_1]; ld.param.u32 %r4, [tag_halo_blocks_param_2]; ld.param.u64 %rd18, [tag_halo_blocks_param_3]; ld.param.u32 %r3, [tag_halo_blocks_param_0+40]; ld.param.u64 %rd13, [tag_halo_blocks_param_0+32]; ld.param.u64 %rd11, [tag_halo_blocks_param_0+16]; mov.u32 %r5, %ntid.z; mov.u32 %r6, %ntid.y; mov.u32 %r7, %ntid.x; mov.b64 %rd19, {%r7, %r6}; mov.u32 %r8, %ctaid.z; mov.u32 %r9, %nctaid.y; mov.u32 %r10, %ctaid.y; mad.lo.s32 %r11, %r8, %r9, %r10; mov.u32 %r12, %nctaid.x; mov.u32 %r13, %ctaid.x; mad.lo.s32 %r14, %r11, %r12, %r13; and.b64 %rd20, %rd19, 4294967295; cvt.u64.u32 %rd21, %r6; bfi.b64 %rd22, %rd21, %rd20, 32, 32; cvt.u64.u32 %rd23, %r5; mov.b64 {%r15, %r16}, %rd22; mov.b64 {%r17, %r18}, %rd23; mul.lo.s32 %r19, 
%r15, %r14; mul.lo.s32 %r20, %r19, %r16; mov.u32 %r21, %tid.z; mov.u32 %r22, %tid.y; mad.lo.s32 %r23, %r21, %r6, %r22; mov.u32 %r24, %tid.x; mad.lo.s32 %r25, %r23, %r7, %r24; mad.lo.s32 %r1, %r20, %r17, %r25; setp.ge.u32 %p1, %r1, %r4; @%p1 bra $L__BB8_8; cvta.to.global.u64 %rd24, %rd17; cvta.to.global.u64 %rd1, %rd13; mul.wide.u32 %rd25, %r1, 24; add.s64 %rd26, %rd24, %rd25; ld.global.u64 %rd2, [%rd26]; shr.u64 %rd27, %rd2, 16; xor.b64 %rd28, %rd27, %rd2; mul.lo.s64 %rd29, %rd28, 2246822507; shr.u64 %rd30, %rd29, 13; xor.b64 %rd31, %rd30, %rd29; mul.lo.s64 %rd32, %rd31, 3266489909; shr.u64 %rd33, %rd32, 16; xor.b64 %rd34, %rd33, %rd32; cvt.u64.u32 %rd35, %r3; add.s64 %rd3, %rd35, -1; and.b64 %rd49, %rd34, %rd3; shl.b64 %rd36, %rd49, 4; add.s64 %rd37, %rd1, %rd36; ld.global.u64 %rd5, [%rd37]; setp.eq.s64 %p2, %rd5, %rd2; @%p2 bra $L__BB8_6; setp.eq.s64 %p3, %rd5, -1; @%p3 bra $L__BB8_8; $L__BB8_4: add.s64 %rd38, %rd49, 1; and.b64 %rd49, %rd38, %rd3; shl.b64 %rd39, %rd49, 4; add.s64 %rd40, %rd1, %rd39; ld.global.u64 %rd8, [%rd40]; setp.eq.s64 %p4, %rd8, %rd2; @%p4 bra $L__BB8_6; setp.eq.s64 %p5, %rd8, -1; @%p5 bra $L__BB8_8; bra.uni $L__BB8_4; $L__BB8_6: shl.b64 %rd43, %rd49, 4; add.s64 %rd44, %rd1, %rd43; ld.global.u32 %r28, [%rd44+8]; mul.wide.u32 %rd45, %r28, 24; add.s64 %rd46, %rd11, %rd45; add.s64 %rd42, %rd46, 16; mov.u32 %r27, 1; // begin inline asm cvta.to.global.u64 %rd41, %rd42;atom.global.exch.b32 %r26, [%rd41], %r27; // end inline asm setp.ne.s32 %p6, %r26, 0; @%p6 bra $L__BB8_8; // begin inline asm cvta.to.global.u64 %rd47, %rd18;atom.global.add.u32 %r29, [%rd47], %r27; // end inline asm $L__BB8_8: ret; } // .globl tag_halo_neighbors .visible .entry tag_halo_neighbors( .param .align 8 .b8 tag_halo_neighbors_param_0[72], .param .u32 tag_halo_neighbors_param_1 ) { .reg .pred %p<18>; .reg .f32 %f<2>; .reg .b32 %r<37>; .reg .b64 %rd<104>; ld.param.u32 %r4, [tag_halo_neighbors_param_1]; ld.param.u32 %r3, [tag_halo_neighbors_param_0+40]; ld.param.u64 %rd29, 
[tag_halo_neighbors_param_0+32]; ld.param.u64 %rd27, [tag_halo_neighbors_param_0+16]; cvta.to.global.u64 %rd1, %rd29; mov.u32 %r5, %ntid.z; mov.u32 %r6, %ntid.y; mov.u32 %r7, %ntid.x; mov.b64 %rd33, {%r7, %r6}; mov.u32 %r8, %ctaid.z; mov.u32 %r9, %nctaid.y; mov.u32 %r10, %ctaid.y; mad.lo.s32 %r11, %r8, %r9, %r10; mov.u32 %r12, %nctaid.x; mov.u32 %r13, %ctaid.x; mad.lo.s32 %r14, %r11, %r12, %r13; and.b64 %rd34, %rd33, 4294967295; cvt.u64.u32 %rd35, %r6; bfi.b64 %rd36, %rd35, %rd34, 32, 32; cvt.u64.u32 %rd37, %r5; mov.b64 {%r15, %r16}, %rd36; mov.b64 {%r17, %r18}, %rd37; mul.lo.s32 %r19, %r15, %r14; mul.lo.s32 %r20, %r19, %r16; mov.u32 %r21, %tid.z; mov.u32 %r22, %tid.y; mad.lo.s32 %r23, %r21, %r6, %r22; mov.u32 %r24, %tid.x; mad.lo.s32 %r25, %r23, %r7, %r24; mad.lo.s32 %r1, %r20, %r17, %r25; setp.ge.u32 %p1, %r1, %r4; @%p1 bra $L__BB9_20; cvta.to.global.u64 %rd2, %rd27; mul.wide.u32 %rd38, %r1, 24; add.s64 %rd39, %rd2, %rd38; add.s64 %rd3, %rd39, 16; ld.global.u32 %r26, [%rd39+16]; and.b32 %r27, %r26, 1; setp.eq.b32 %p2, %r27, 1; mov.pred %p3, 0; xor.pred %p4, %p2, %p3; not.pred %p5, %p4; @%p5 bra $L__BB9_20; ld.global.u64 %rd40, [%rd3+-16]; and.b64 %rd41, %rd40, 4294967295; and.b64 %rd42, %rd40, -4294967296; add.s64 %rd43, %rd42, -4294967296; or.b64 %rd4, %rd43, %rd41; add.s64 %rd44, %rd40, -1; and.b64 %rd45, %rd44, 4294967295; or.b64 %rd5, %rd45, %rd42; or.b64 %rd6, %rd45, %rd43; cvt.u64.u32 %rd46, %r3; add.s64 %rd7, %rd46, -1; shr.u64 %rd47, %rd4, 16; xor.b64 %rd48, %rd47, %rd4; mul.lo.s64 %rd49, %rd48, 2246822507; shr.u64 %rd50, %rd49, 13; xor.b64 %rd51, %rd50, %rd49; mul.lo.s64 %rd52, %rd51, 3266489909; shr.u64 %rd53, %rd52, 16; xor.b64 %rd54, %rd53, %rd52; and.b64 %rd98, %rd54, %rd7; shl.b64 %rd55, %rd98, 4; add.s64 %rd56, %rd1, %rd55; ld.global.u64 %rd9, [%rd56]; setp.eq.s64 %p6, %rd9, %rd4; @%p6 bra $L__BB9_7; setp.eq.s64 %p7, %rd9, -1; @%p7 bra $L__BB9_8; $L__BB9_5: add.s64 %rd57, %rd98, 1; and.b64 %rd98, %rd57, %rd7; shl.b64 %rd58, %rd98, 4; add.s64 %rd59, 
%rd1, %rd58; ld.global.u64 %rd12, [%rd59]; setp.eq.s64 %p8, %rd12, %rd4; @%p8 bra $L__BB9_7; setp.eq.s64 %p9, %rd12, -1; @%p9 bra $L__BB9_8; bra.uni $L__BB9_5; $L__BB9_7: shl.b64 %rd60, %rd98, 4; add.s64 %rd61, %rd1, %rd60; ld.global.u32 %r28, [%rd61+8]; mul.wide.u32 %rd62, %r28, 24; add.s64 %rd63, %rd2, %rd62; ld.global.u32 %r29, [%rd63+16]; or.b32 %r30, %r29, 2; st.global.u32 [%rd63+16], %r30; $L__BB9_8: shr.u64 %rd64, %rd5, 16; xor.b64 %rd65, %rd64, %rd5; mul.lo.s64 %rd66, %rd65, 2246822507; shr.u64 %rd67, %rd66, 13; xor.b64 %rd68, %rd67, %rd66; mul.lo.s64 %rd69, %rd68, 3266489909; shr.u64 %rd70, %rd69, 16; xor.b64 %rd71, %rd70, %rd69; and.b64 %rd100, %rd71, %rd7; shl.b64 %rd72, %rd100, 4; add.s64 %rd73, %rd1, %rd72; ld.global.u64 %rd15, [%rd73]; setp.eq.s64 %p10, %rd15, %rd5; @%p10 bra $L__BB9_13; setp.eq.s64 %p11, %rd15, -1; @%p11 bra $L__BB9_14; $L__BB9_11: add.s64 %rd74, %rd100, 1; and.b64 %rd100, %rd74, %rd7; shl.b64 %rd75, %rd100, 4; add.s64 %rd76, %rd1, %rd75; ld.global.u64 %rd18, [%rd76]; setp.eq.s64 %p12, %rd18, %rd5; @%p12 bra $L__BB9_13; setp.eq.s64 %p13, %rd18, -1; @%p13 bra $L__BB9_14; bra.uni $L__BB9_11; $L__BB9_13: shl.b64 %rd77, %rd100, 4; add.s64 %rd78, %rd1, %rd77; ld.global.u32 %r31, [%rd78+8]; mul.wide.u32 %rd79, %r31, 24; add.s64 %rd80, %rd2, %rd79; ld.global.u32 %r32, [%rd80+16]; or.b32 %r33, %r32, 2; st.global.u32 [%rd80+16], %r33; $L__BB9_14: shr.u64 %rd81, %rd6, 16; xor.b64 %rd82, %rd81, %rd6; mul.lo.s64 %rd83, %rd82, 2246822507; shr.u64 %rd84, %rd83, 13; xor.b64 %rd85, %rd84, %rd83; mul.lo.s64 %rd86, %rd85, 3266489909; shr.u64 %rd87, %rd86, 16; xor.b64 %rd88, %rd87, %rd86; and.b64 %rd102, %rd88, %rd7; shl.b64 %rd89, %rd102, 4; add.s64 %rd90, %rd1, %rd89; ld.global.u64 %rd21, [%rd90]; setp.eq.s64 %p14, %rd21, %rd6; @%p14 bra $L__BB9_19; setp.eq.s64 %p15, %rd21, -1; @%p15 bra $L__BB9_20; $L__BB9_17: add.s64 %rd91, %rd102, 1; and.b64 %rd102, %rd91, %rd7; shl.b64 %rd92, %rd102, 4; add.s64 %rd93, %rd1, %rd92; ld.global.u64 %rd24, [%rd93]; 
setp.eq.s64 %p16, %rd24, %rd6; @%p16 bra $L__BB9_19; setp.eq.s64 %p17, %rd24, -1; @%p17 bra $L__BB9_20; bra.uni $L__BB9_17; $L__BB9_19: shl.b64 %rd94, %rd102, 4; add.s64 %rd95, %rd1, %rd94; ld.global.u32 %r34, [%rd95+8]; mul.wide.u32 %rd96, %r34, 24; add.s64 %rd97, %rd2, %rd96; ld.global.u32 %r35, [%rd97+16]; or.b32 %r36, %r35, 2; st.global.u32 [%rd97+16], %r36; $L__BB9_20: ret; } // .globl copy_halo_to_staging .visible .entry copy_halo_to_staging( .param .align 8 .b8 copy_halo_to_staging_param_0[72], .param .u64 copy_halo_to_staging_param_1, .param .u64 copy_halo_to_staging_param_2 ) { .reg .pred %p<6>; .reg .f32 %f<256>; .reg .b32 %r<31>; .reg .b64 %rd<62>; ld.param.u64 %rd10, [copy_halo_to_staging_param_1]; ld.param.u64 %rd11, [copy_halo_to_staging_param_2]; ld.param.u64 %rd5, [copy_halo_to_staging_param_0+24]; ld.param.u64 %rd4, [copy_halo_to_staging_param_0+16]; ld.param.u64 %rd3, [copy_halo_to_staging_param_0+8]; cvta.to.global.u64 %rd12, %rd5; mov.u32 %r4, %ntid.z; mov.u32 %r5, %ntid.y; mov.u32 %r6, %ntid.x; mov.b64 %rd13, {%r6, %r5}; mov.u32 %r7, %ctaid.z; mov.u32 %r8, %nctaid.y; mov.u32 %r9, %ctaid.y; mad.lo.s32 %r10, %r7, %r8, %r9; mov.u32 %r11, %nctaid.x; mov.u32 %r12, %ctaid.x; mad.lo.s32 %r13, %r10, %r11, %r12; and.b64 %rd14, %rd13, 4294967295; cvt.u64.u32 %rd15, %r5; bfi.b64 %rd16, %rd15, %rd14, 32, 32; cvt.u64.u32 %rd17, %r4; mov.b64 {%r14, %r15}, %rd16; mov.b64 {%r16, %r17}, %rd17; mul.lo.s32 %r18, %r14, %r13; mul.lo.s32 %r19, %r18, %r15; mov.u32 %r20, %tid.z; mov.u32 %r21, %tid.y; mad.lo.s32 %r22, %r20, %r5, %r21; mov.u32 %r23, %tid.x; mad.lo.s32 %r24, %r22, %r6, %r23; mad.lo.s32 %r1, %r19, %r16, %r24; ld.global.u32 %r25, [%rd12]; setp.ge.u32 %p1, %r1, %r25; @%p1 bra $L__BB10_3; cvta.to.global.u64 %rd18, %rd4; cvt.u64.u32 %rd1, %r1; mul.wide.u32 %rd19, %r1, 24; add.s64 %rd20, %rd18, %rd19; add.s64 %rd2, %rd20, 16; ld.global.u32 %r26, [%rd20+16]; and.b32 %r27, %r26, 1; setp.eq.b32 %p2, %r27, 1; mov.pred %p3, 0; xor.pred %p4, %p2, %p3; not.pred %p5, 
// NOTE(review): machine-generated PTX (NVVM output). Do not hand-edit logic; regenerate
// from the Rust/CUDA source. Comments only annotate what is visible in this chunk.
// Tail of a kernel whose head lies above this chunk. Visible behavior: under %p5 it skips
// everything; otherwise it claims a destination slot by a wrapping atomic decrement
// (atom.global.dec.u32 with operand -1) of a global counter, computes dst = base + slot*776,
// stores an 8-byte id at dst, then bulk-copies a 768-byte record from [%rd29] into [dst+8]
// using vectorized v2.f32 and u64 loads/stores. 776 = 8-byte id + 768-byte payload.
%p4; @%p5 bra $L__BB10_3; mov.u32 %r29, -1;
// begin inline asm
cvta.to.global.u64 %rd21, %rd11;atom.global.dec.u32 %r28, [%rd21], %r29;
// end inline asm
// %r30 = claimed slot index (decremented value - 1); %rd25 = dst record, %rd29 = src record.
add.s32 %r30, %r28, -1; cvta.to.global.u64 %rd23, %rd10; mul.wide.u32 %rd24, %r30, 776; add.s64 %rd25, %rd23, %rd24; ld.global.u64 %rd26, [%rd2+-16]; st.global.u64 [%rd25], %rd26; mul.lo.s64 %rd27, %rd1, 768; cvta.to.global.u64 %rd28, %rd3; add.s64 %rd29, %rd28, %rd27;
// Fully unrolled 768-byte copy (offsets 0..760 of src -> 8..768 of dst).
ld.global.v2.f32 {%f2, %f3}, [%rd29]; ld.global.v2.f32 {%f6, %f7}, [%rd29+8]; ld.global.v2.f32 {%f10, %f11}, [%rd29+16]; ld.global.u64 %rd30, [%rd29+24]; ld.global.u64 %rd31, [%rd29+32]; ld.global.v2.f32 {%f14, %f15}, [%rd29+40]; st.global.v2.f32 [%rd25+8], {%f2, %f3}; st.global.v2.f32 [%rd25+16], {%f6, %f7}; st.global.v2.f32 [%rd25+24], {%f10, %f11}; st.global.u64 [%rd25+32], %rd30; st.global.u64 [%rd25+40], %rd31; st.global.v2.f32 [%rd25+48], {%f14, %f15}; ld.global.v2.f32 {%f18, %f19}, [%rd29+48]; ld.global.v2.f32 {%f22, %f23}, [%rd29+56]; ld.global.v2.f32 {%f26, %f27}, [%rd29+64]; ld.global.u64 %rd32, [%rd29+72]; ld.global.u64 %rd33, [%rd29+80]; ld.global.v2.f32 {%f30, %f31}, [%rd29+88]; st.global.v2.f32 [%rd25+56], {%f18, %f19}; st.global.v2.f32 [%rd25+64], {%f22, %f23}; st.global.v2.f32 [%rd25+72], {%f26, %f27}; st.global.u64 [%rd25+80], %rd32; st.global.u64 [%rd25+88], %rd33; st.global.v2.f32 [%rd25+96], {%f30, %f31}; ld.global.v2.f32 {%f34, %f35}, [%rd29+96]; ld.global.v2.f32 {%f38, %f39}, [%rd29+104]; ld.global.v2.f32 {%f42, %f43}, [%rd29+112]; ld.global.u64 %rd34, [%rd29+120]; ld.global.u64 %rd35, [%rd29+128]; ld.global.v2.f32 {%f46, %f47}, [%rd29+136]; st.global.v2.f32 [%rd25+104], {%f34, %f35}; st.global.v2.f32 [%rd25+112], {%f38, %f39}; st.global.v2.f32 [%rd25+120], {%f42, %f43}; st.global.u64 [%rd25+128], %rd34; st.global.u64 [%rd25+136], %rd35; st.global.v2.f32 [%rd25+144], {%f46, %f47}; ld.global.v2.f32 {%f50, %f51}, [%rd29+144]; ld.global.v2.f32 {%f54, %f55}, [%rd29+152]; ld.global.v2.f32 {%f58, %f59}, [%rd29+160]; ld.global.u64 %rd36, [%rd29+168]; ld.global.u64 %rd37, [%rd29+176]; ld.global.v2.f32 {%f62, %f63}, [%rd29+184]; st.global.v2.f32 [%rd25+152], {%f50, %f51}; st.global.v2.f32 [%rd25+160], {%f54, %f55}; st.global.v2.f32 [%rd25+168], {%f58, %f59}; st.global.u64 [%rd25+176], %rd36; st.global.u64 [%rd25+184], %rd37; st.global.v2.f32 [%rd25+192], {%f62, %f63}; ld.global.v2.f32 {%f66, %f67}, [%rd29+192]; ld.global.v2.f32 {%f70, %f71}, [%rd29+200]; ld.global.v2.f32 {%f74, %f75}, [%rd29+208]; ld.global.u64 %rd38, [%rd29+216]; ld.global.u64 %rd39, [%rd29+224]; ld.global.v2.f32 {%f78, %f79}, [%rd29+232]; st.global.v2.f32 [%rd25+200], {%f66, %f67}; st.global.v2.f32 [%rd25+208], {%f70, %f71}; st.global.v2.f32 [%rd25+216], {%f74, %f75}; st.global.u64 [%rd25+224], %rd38; st.global.u64 [%rd25+232], %rd39; st.global.v2.f32 [%rd25+240], {%f78, %f79}; ld.global.v2.f32 {%f82, %f83}, [%rd29+240]; ld.global.v2.f32 {%f86, %f87}, [%rd29+248]; ld.global.v2.f32 {%f90, %f91}, [%rd29+256]; ld.global.u64 %rd40, [%rd29+264]; ld.global.u64 %rd41, [%rd29+272]; ld.global.v2.f32 {%f94, %f95}, [%rd29+280]; st.global.v2.f32 [%rd25+248], {%f82, %f83}; st.global.v2.f32 [%rd25+256], {%f86, %f87}; st.global.v2.f32 [%rd25+264], {%f90, %f91}; st.global.u64 [%rd25+272], %rd40; st.global.u64 [%rd25+280], %rd41; st.global.v2.f32 [%rd25+288], {%f94, %f95}; ld.global.v2.f32 {%f98, %f99}, [%rd29+288]; ld.global.f32 %f102, [%rd29+296]; ld.global.f32 %f103, [%rd29+300]; ld.global.v2.f32 {%f104, %f105}, [%rd29+304]; ld.global.u64 %rd42, [%rd29+312]; ld.global.u64 %rd43, [%rd29+320]; ld.global.v2.f32 {%f108, %f109}, [%rd29+328]; st.global.v2.f32 [%rd25+296], {%f98, %f99}; st.global.f32 [%rd25+304], %f102; st.global.f32 [%rd25+308], %f103; st.global.f32 [%rd25+312], %f104; st.global.f32 [%rd25+316], %f105; st.global.u64 [%rd25+320], %rd42; st.global.u64 [%rd25+328], %rd43; st.global.v2.f32 [%rd25+336], {%f108, %f109}; ld.global.v2.f32 {%f112, %f113}, [%rd29+336]; ld.global.v2.f32 {%f116, %f117}, [%rd29+344]; ld.global.v2.f32 {%f120, %f121}, [%rd29+352]; ld.global.u64 %rd44, [%rd29+360]; ld.global.u64 %rd45, [%rd29+368]; ld.global.v2.f32 {%f124, %f125}, [%rd29+376]; st.global.v2.f32 [%rd25+344], {%f112, %f113}; st.global.v2.f32 [%rd25+352], {%f116, %f117}; st.global.v2.f32 [%rd25+360], {%f120, %f121}; st.global.u64 [%rd25+368], %rd44; st.global.u64 [%rd25+376], %rd45; st.global.v2.f32 [%rd25+384], {%f124, %f125}; ld.global.v2.f32 {%f128, %f129}, [%rd29+384]; ld.global.v2.f32 {%f132, %f133}, [%rd29+392]; ld.global.v2.f32 {%f136, %f137}, [%rd29+400]; ld.global.u64 %rd46, [%rd29+408]; ld.global.u64 %rd47, [%rd29+416]; ld.global.v2.f32 {%f140, %f141}, [%rd29+424]; st.global.v2.f32 [%rd25+392], {%f128, %f129}; st.global.v2.f32 [%rd25+400], {%f132, %f133}; st.global.v2.f32 [%rd25+408], {%f136, %f137}; st.global.u64 [%rd25+416], %rd46; st.global.u64 [%rd25+424], %rd47; st.global.v2.f32 [%rd25+432], {%f140, %f141}; ld.global.v2.f32 {%f144, %f145}, [%rd29+432]; ld.global.v2.f32 {%f148, %f149}, [%rd29+440]; ld.global.v2.f32 {%f152, %f153}, [%rd29+448]; ld.global.u64 %rd48, [%rd29+456]; ld.global.u64 %rd49, [%rd29+464]; ld.global.v2.f32 {%f156, %f157}, [%rd29+472]; st.global.v2.f32 [%rd25+440], {%f144, %f145}; st.global.v2.f32 [%rd25+448], {%f148, %f149}; st.global.v2.f32 [%rd25+456], {%f152, %f153}; st.global.u64 [%rd25+464], %rd48; st.global.u64 [%rd25+472], %rd49; st.global.v2.f32 [%rd25+480], {%f156, %f157}; ld.global.v2.f32 {%f160, %f161}, [%rd29+480]; ld.global.v2.f32 {%f164, %f165}, [%rd29+488]; ld.global.v2.f32 {%f168, %f169}, [%rd29+496]; ld.global.u64 %rd50, [%rd29+504]; ld.global.u64 %rd51, [%rd29+512]; ld.global.v2.f32 {%f172, %f173}, [%rd29+520]; st.global.v2.f32 [%rd25+488], {%f160, %f161}; st.global.v2.f32 [%rd25+496], {%f164, %f165}; st.global.v2.f32 [%rd25+504], {%f168, %f169}; st.global.u64 [%rd25+512], %rd50; st.global.u64 [%rd25+520], %rd51; st.global.v2.f32 [%rd25+528], {%f172, %f173}; ld.global.v2.f32 {%f176, %f177}, [%rd29+528]; ld.global.v2.f32 {%f180, %f181}, [%rd29+536]; ld.global.v2.f32 {%f184, %f185}, [%rd29+544]; ld.global.u64 %rd52, [%rd29+552]; ld.global.u64 %rd53, [%rd29+560]; ld.global.v2.f32 {%f188, %f189}, [%rd29+568]; st.global.v2.f32 [%rd25+536], {%f176, %f177}; st.global.v2.f32 [%rd25+544], {%f180, %f181}; st.global.v2.f32 [%rd25+552], {%f184, %f185}; st.global.u64 [%rd25+560], %rd52; st.global.u64 [%rd25+568], %rd53; st.global.v2.f32 [%rd25+576], {%f188, %f189}; ld.global.v2.f32 {%f192, %f193}, [%rd29+576]; ld.global.v2.f32 {%f196, %f197}, [%rd29+584]; ld.global.v2.f32 {%f200, %f201}, [%rd29+592]; ld.global.u64 %rd54, [%rd29+600]; ld.global.u64 %rd55, [%rd29+608]; ld.global.v2.f32 {%f204, %f205}, [%rd29+616]; st.global.v2.f32 [%rd25+584], {%f192, %f193}; st.global.v2.f32 [%rd25+592], {%f196, %f197}; st.global.v2.f32 [%rd25+600], {%f200, %f201}; st.global.u64 [%rd25+608], %rd54; st.global.u64 [%rd25+616], %rd55; st.global.v2.f32 [%rd25+624], {%f204, %f205}; ld.global.v2.f32 {%f208, %f209}, [%rd29+624]; ld.global.v2.f32 {%f212, %f213}, [%rd29+632]; ld.global.v2.f32 {%f216, %f217}, [%rd29+640]; ld.global.u64 %rd56, [%rd29+648]; ld.global.u64 %rd57, [%rd29+656]; ld.global.v2.f32 {%f220, %f221}, [%rd29+664]; st.global.v2.f32 [%rd25+632], {%f208, %f209}; st.global.v2.f32 [%rd25+640], {%f212, %f213}; st.global.v2.f32 [%rd25+648], {%f216, %f217}; st.global.u64 [%rd25+656], %rd56; st.global.u64 [%rd25+664], %rd57; st.global.v2.f32 [%rd25+672], {%f220, %f221}; ld.global.v2.f32 {%f224, %f225}, [%rd29+672]; ld.global.v2.f32 {%f228, %f229}, [%rd29+680]; ld.global.v2.f32 {%f232, %f233}, [%rd29+688]; ld.global.u64 %rd58, [%rd29+696]; ld.global.u64 %rd59, [%rd29+704]; ld.global.v2.f32 {%f236, %f237}, [%rd29+712]; st.global.v2.f32 [%rd25+680], {%f224, %f225}; st.global.v2.f32 [%rd25+688], {%f228, %f229}; st.global.v2.f32 [%rd25+696], {%f232, %f233}; st.global.u64 [%rd25+704], %rd58; st.global.u64 [%rd25+712], %rd59; st.global.v2.f32 [%rd25+720], {%f236, %f237}; ld.global.v2.f32 {%f240, %f241}, [%rd29+720]; ld.global.v2.f32 {%f244, 
// Final stretch of the unrolled 768-byte copy begun above (generated PTX — do not hand-edit).
%f245}, [%rd29+728]; ld.global.v2.f32 {%f248, %f249}, [%rd29+736]; ld.global.u64 %rd60, [%rd29+744]; ld.global.u64 %rd61, [%rd29+752]; ld.global.v2.f32 {%f252, %f253}, [%rd29+760]; st.global.v2.f32 [%rd25+728], {%f240, %f241}; st.global.v2.f32 [%rd25+736], {%f244, %f245}; st.global.v2.f32 [%rd25+744], {%f248, %f249}; st.global.u64 [%rd25+752], %rd60; st.global.u64 [%rd25+760], %rd61; st.global.v2.f32 [%rd25+768], {%f252, %f253};
$L__BB10_3: ret; }
// .globl merge_halo_blocks
// merge_halo_blocks: one CTA per 776-byte halo-block record in param_1.
// Loads the record's 64-bit block key, mixes it with a shift-xor-multiply hash
// (constants 2246822507 / 3266489909), and linearly probes a hash table of 16-byte
// entries (base = param_0+32) masked by capacity-1 (capacity at param_0+40; the AND
// mask implies a power-of-two capacity — from the Rust side, not provable here).
// Entry key -1 means empty -> return without merging. On a key match, entry field +8
// gives a block index; dst = (index*16 + tid)*48 into the array at param_0+8, bounds-
// checked against param_0+64 (trap on overflow). Threads tid < 16 then accumulate five
// f32 fields of the CTA's record (offsets +8..+24) into dst via red.global.add.f32
// (fire-and-forget atomic adds; the u32 regs carry raw f32 bit patterns).
.visible .entry merge_halo_blocks( .param .align 8 .b8 merge_halo_blocks_param_0[72], .param .u64 merge_halo_blocks_param_1 ) { .reg .pred %p<7>; .reg .f32 %f<2>; .reg .b32 %r<13>; .reg .b64 %rd<66>; ld.param.u64 %rd21, [merge_halo_blocks_param_1]; ld.param.u64 %rd20, [merge_halo_blocks_param_0+64]; ld.param.u32 %r2, [merge_halo_blocks_param_0+40]; ld.param.u64 %rd17, [merge_halo_blocks_param_0+32]; ld.param.u64 %rd14, [merge_halo_blocks_param_0+8]; cvta.to.global.u64 %rd22, %rd21; cvta.to.global.u64 %rd1, %rd17; mov.u32 %r3, %ctaid.x; mul.wide.u32 %rd23, %r3, 776; add.s64 %rd24, %rd22, %rd23; ld.global.u64 %rd2, [%rd24];
// 64-bit hash mix of the block key, then masked initial probe slot.
shr.u64 %rd25, %rd2, 16; xor.b64 %rd26, %rd25, %rd2; mul.lo.s64 %rd27, %rd26, 2246822507; shr.u64 %rd28, %rd27, 13; xor.b64 %rd29, %rd28, %rd27; mul.lo.s64 %rd30, %rd29, 3266489909; shr.u64 %rd31, %rd30, 16; xor.b64 %rd32, %rd31, %rd30; cvt.u64.u32 %rd33, %r2; add.s64 %rd3, %rd33, -1; and.b64 %rd64, %rd32, %rd3; shl.b64 %rd34, %rd64, 4; add.s64 %rd35, %rd1, %rd34; ld.global.u64 %rd5, [%rd35]; setp.eq.s64 %p1, %rd5, %rd2; @%p1 bra $L__BB11_5; setp.eq.s64 %p2, %rd5, -1; @%p2 bra $L__BB11_10;
// Linear-probe loop: advance slot by 1 (mod capacity) until key match or empty (-1).
$L__BB11_3: add.s64 %rd36, %rd64, 1; and.b64 %rd64, %rd36, %rd3; shl.b64 %rd37, %rd64, 4; add.s64 %rd38, %rd1, %rd37; ld.global.u64 %rd8, [%rd38]; setp.eq.s64 %p3, %rd8, %rd2; @%p3 bra $L__BB11_5; setp.eq.s64 %p4, %rd8, -1; @%p4 bra $L__BB11_10; bra.uni $L__BB11_3;
// Hit: compute destination, bounds-check, then 16 threads do atomic f32 accumulation.
$L__BB11_5: shl.b64 %rd39, %rd64, 4; add.s64 %rd40, %rd1, %rd39; ld.global.u32 %r4, [%rd40+8]; mul.wide.u32 %rd41, %r4, 16; mov.u32 %r5, %tid.x; cvt.u64.u32 %rd10, %r5; add.s64 %rd11, %rd41, %rd10; setp.gt.u64 %p5, %rd20, %rd11; @%p5 bra $L__BB11_7; bra.uni $L__BB11_6; $L__BB11_7: mul.lo.s64 %rd42, %rd11, 48; add.s64 %rd13, %rd14, %rd42; setp.lt.u32 %p6, %r5, 16; @%p6 bra $L__BB11_9; bra.uni $L__BB11_8; $L__BB11_9: mul.lo.s64 %rd56, %rd10, 48; add.s64 %rd57, %rd24, %rd56; ld.global.u32 %r7, [%rd57+8];
// begin inline asm
cvta.to.global.u64 %rd43, %rd13;red.global.add.f32 [%rd43], %r7;
// end inline asm
add.s64 %rd46, %rd13, 4; ld.global.u32 %rd60, [%rd57+12]; ld.global.u32 %rd61, [%rd57+16]; bfi.b64 %rd62, %rd61, %rd60, 32, 32; cvt.u32.u64 %r8, %rd62; shr.u64 %rd63, %rd62, 32; cvt.u32.u64 %r9, %rd63;
// begin inline asm
cvta.to.global.u64 %rd45, %rd46;red.global.add.f32 [%rd45], %r8;
// end inline asm
add.s64 %rd48, %rd13, 8;
// begin inline asm
cvta.to.global.u64 %rd47, %rd48;red.global.add.f32 [%rd47], %r9;
// end inline asm
add.s64 %rd50, %rd13, 16; ld.global.u32 %r10, [%rd57+24];
// begin inline asm
cvta.to.global.u64 %rd49, %rd50;red.global.add.f32 [%rd49], %r10;
// end inline asm
add.s64 %rd52, %rd13, 12; ld.global.u32 %r11, [%rd57+20];
// begin inline asm
cvta.to.global.u64 %rd51, %rd52;red.global.add.f32 [%rd51], %r11;
// end inline asm
$L__BB11_10: ret;
// Out-of-bounds destination or an unexpected control path aborts the kernel.
$L__BB11_6: trap; $L__BB11_8: trap; }
// .globl update_block_particle_count
// update_block_particle_count: one thread per particle (count = param_1). Loads the
// particle's two packed 32-bit words from param_0 (f32 bit patterns), divides each
// component by the f32 at the start of param_2 (presumably the grid cell width — TODO
// confirm against the Rust source), rounds to a 64-bit cell coordinate pair, hashes it
// into the same probe table (base param_2+32, capacity param_2+40), and atomically
// increments a per-block counter at offset +12 of a 24-byte record (base param_2+16).
.visible .entry update_block_particle_count( .param .u64 update_block_particle_count_param_0, .param .u32 update_block_particle_count_param_1, .param .align 8 .b8 update_block_particle_count_param_2[72] ) { .reg .pred %p<13>; .reg .f32 %f<15>; .reg .b32 %r<36>; .reg .b64 %rd<62>; ld.param.u64 %rd11, [update_block_particle_count_param_0]; ld.param.u32 %r4, [update_block_particle_count_param_1]; ld.param.u32 %r3, [update_block_particle_count_param_2+40]; ld.param.u64 %rd15, [update_block_particle_count_param_2+32]; ld.param.u64 %rd13, [update_block_particle_count_param_2+16]; ld.param.f32 %f2, [update_block_particle_count_param_2]; mov.u32 %r5, %ntid.z; 
// Generated PTX (NVVM) — annotations only; regenerate from source rather than hand-editing.
// Global linear thread id %r1 from the full 3D grid/block shape; exit if >= particle count.
mov.u32 %r6, %ntid.y; mov.u32 %r7, %ntid.x; mov.b64 %rd19, {%r7, %r6}; mov.u32 %r8, %ctaid.z; mov.u32 %r9, %nctaid.y; mov.u32 %r10, %ctaid.y; mad.lo.s32 %r11, %r8, %r9, %r10; mov.u32 %r12, %nctaid.x; mov.u32 %r13, %ctaid.x; mad.lo.s32 %r14, %r11, %r12, %r13; and.b64 %rd20, %rd19, 4294967295; cvt.u64.u32 %rd21, %r6; bfi.b64 %rd22, %rd21, %rd20, 32, 32; cvt.u64.u32 %rd23, %r5; mov.b64 {%r15, %r16}, %rd22; mov.b64 {%r17, %r18}, %rd23; mul.lo.s32 %r19, %r15, %r14; mul.lo.s32 %r20, %r19, %r16; mov.u32 %r21, %tid.z; mov.u32 %r22, %tid.y; mad.lo.s32 %r23, %r21, %r6, %r22; mov.u32 %r24, %tid.x; mad.lo.s32 %r25, %r23, %r7, %r24; mad.lo.s32 %r1, %r20, %r17, %r25; setp.ge.u32 %p1, %r1, %r4; @%p1 bra $L__BB12_8;
// Load the particle's two f32 components (as raw bits), divide by the cell size %f2,
// round-half-away-from-zero (add copysign(0.5) then truncate), clamp each to a 32-bit
// range with NaN/overflow fallbacks, and pack the two cell coordinates into a 64-bit key %rd2.
cvta.to.global.u64 %rd24, %rd11; cvta.to.global.u64 %rd1, %rd15; mul.wide.u32 %rd25, %r1, 8; add.s64 %rd26, %rd24, %rd25; ld.global.u32 %rd27, [%rd26]; ld.global.u32 %rd28, [%rd26+4]; bfi.b64 %rd29, %rd28, %rd27, 32, 32; cvt.u32.u64 %r26, %rd29; mov.b32 %f3, %r26; div.rn.f32 %f4, %f3, %f2; shr.u64 %rd30, %rd29, 32; cvt.u32.u64 %r27, %rd30; mov.b32 %f5, %r27; div.rn.f32 %f6, %f5, %f2; mov.b32 %r28, %f4; and.b32 %r29, %r28, -2147483648; or.b32 %r30, %r29, 1056964608; mov.b32 %f7, %r30; add.rz.f32 %f8, %f4, %f7; cvt.rzi.f32.f32 %f9, %f8; setp.leu.f32 %p2, %f9, 0f5EFFFFFF; max.f32 %f10, %f9, 0fDF000000; cvt.rzi.s64.f32 %rd31, %f10; setp.num.f32 %p3, %f9, %f9; mov.b32 %r31, %f6; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r32, 1056964608; mov.b32 %f11, %r33; add.rz.f32 %f12, %f6, %f11; cvt.rzi.f32.f32 %f13, %f12; setp.leu.f32 %p4, %f13, 0f5EFFFFFF; max.f32 %f14, %f13, 0fDF000000; cvt.rzi.s64.f32 %rd32, %f14; setp.num.f32 %p5, %f13, %f13; add.s64 %rd33, %rd31, 8589934590; shr.u64 %rd34, %rd33, 2; shl.b64 %rd35, %rd32, 30; and.b64 %rd36, %rd34, 4294967295; and.pred %p6, %p3, %p2; selp.b64 %rd37, %rd36, 2147483647, %p6; add.s64 %rd38, %rd35, 9223372034707292160; and.b64 %rd39, %rd38, -4294967296; and.pred %p7, %p5, %p4; selp.b64 %rd40, %rd39, 9223372032559808512, %p7; or.b64 %rd2, %rd40, %rd37;
// Hash %rd2 (same shift-xor-multiply mix as merge_halo_blocks) and linearly probe the
// 16-byte-entry table; empty slot (-1) means the block is absent -> return.
shr.u64 %rd41, %rd2, 16; xor.b64 %rd42, %rd41, %rd2; mul.lo.s64 %rd43, %rd42, 2246822507; shr.u64 %rd44, %rd43, 13; xor.b64 %rd45, %rd44, %rd43; mul.lo.s64 %rd46, %rd45, 3266489909; shr.u64 %rd47, %rd46, 16; xor.b64 %rd48, %rd47, %rd46; cvt.u64.u32 %rd49, %r3; add.s64 %rd3, %rd49, -1; and.b64 %rd60, %rd48, %rd3; shl.b64 %rd50, %rd60, 4; add.s64 %rd51, %rd1, %rd50; ld.global.u64 %rd5, [%rd51]; setp.eq.s64 %p8, %rd5, %rd2; @%p8 bra $L__BB12_6; setp.eq.s64 %p9, %rd5, -1; @%p9 bra $L__BB12_8; $L__BB12_4: add.s64 %rd52, %rd60, 1; and.b64 %rd60, %rd52, %rd3; shl.b64 %rd53, %rd60, 4; add.s64 %rd54, %rd1, %rd53; ld.global.u64 %rd8, [%rd54]; setp.eq.s64 %p10, %rd8, %rd2; @%p10 bra $L__BB12_6; setp.eq.s64 %p11, %rd8, -1; @%p11 bra $L__BB12_8; bra.uni $L__BB12_4;
// Hit: atomically bump the u32 counter at +12 of the matched 24-byte block record
// (guarded against a null record pointer).
$L__BB12_6: shl.b64 %rd55, %rd60, 4; add.s64 %rd56, %rd1, %rd55; ld.global.u32 %r34, [%rd56+8]; mul.wide.u32 %rd57, %r34, 24; add.s64 %rd10, %rd13, %rd57; setp.eq.s64 %p12, %rd10, 0; @%p12 bra $L__BB12_8; add.s64 %rd59, %rd10, 12; mov.u32 %r35, 1;
// begin inline asm
cvta.to.global.u64 %rd58, %rd59;red.global.add.u32 [%rd58], %r35;
// end inline asm
$L__BB12_8: ret; }
// .globl copy_particles_len_to_scan_value
// copy_particles_len_to_scan_value: for each i < *(u32*)(param_0+24):
//   scan[i] = blocks[i].field_at_offset_12   (blocks = 24-byte records at param_0+16,
//   scan = u32 array at param_1). Feeds the prefix-sum pass.
.visible .entry copy_particles_len_to_scan_value( .param .align 8 .b8 copy_particles_len_to_scan_value_param_0[72], .param .u64 copy_particles_len_to_scan_value_param_1 ) { .reg .pred %p<2>; .reg .f32 %f<2>; .reg .b32 %r<27>; .reg .b64 %rd<21>; ld.param.u64 %rd8, [copy_particles_len_to_scan_value_param_1]; ld.param.u64 %rd3, [copy_particles_len_to_scan_value_param_0+24]; ld.param.u64 %rd2, [copy_particles_len_to_scan_value_param_0+16]; cvta.to.global.u64 %rd9, %rd3; mov.u32 %r4, %ntid.z; mov.u32 %r5, %ntid.y; mov.u32 %r6, %ntid.x; mov.b64 %rd10, {%r6, %r5}; mov.u32 %r7, %ctaid.z; mov.u32 %r8, %nctaid.y; mov.u32 %r9, %ctaid.y; mad.lo.s32 %r10, %r7, %r8, %r9; mov.u32 %r11, %nctaid.x; mov.u32 %r12, %ctaid.x; mad.lo.s32 %r13, %r10, %r11, %r12; and.b64 %rd11, %rd10, 4294967295; cvt.u64.u32 
// Generated PTX (NVVM) — annotations only. Global linear thread id %r1, then the
// bounded gather: scan[i] = blocks[i] counter at offset +12.
%rd12, %r5; bfi.b64 %rd13, %rd12, %rd11, 32, 32; cvt.u64.u32 %rd14, %r4; mov.b64 {%r14, %r15}, %rd13; mov.b64 {%r16, %r17}, %rd14; mul.lo.s32 %r18, %r14, %r13; mul.lo.s32 %r19, %r18, %r15; mov.u32 %r20, %tid.z; mov.u32 %r21, %tid.y; mad.lo.s32 %r22, %r20, %r5, %r21; mov.u32 %r23, %tid.x; mad.lo.s32 %r24, %r22, %r6, %r23; mad.lo.s32 %r1, %r19, %r16, %r24; ld.global.u32 %r25, [%rd9]; setp.ge.u32 %p1, %r1, %r25; @%p1 bra $L__BB13_2; cvta.to.global.u64 %rd15, %rd8; mul.wide.u32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; cvta.to.global.u64 %rd18, %rd2; mul.wide.u32 %rd19, %r1, 24; add.s64 %rd20, %rd18, %rd19; ld.global.u32 %r26, [%rd20+12]; st.global.u32 [%rd17], %r26; $L__BB13_2: ret; }
// .globl copy_scan_values_to_first_particles
// copy_scan_values_to_first_particles: inverse direction of the kernel above — for each
// i < *(u32*)(param_0+24): blocks[i].field_at_offset_8 = scan[i]. Writes the (presumably
// prefix-summed) scan result back into each 24-byte block record.
.visible .entry copy_scan_values_to_first_particles( .param .align 8 .b8 copy_scan_values_to_first_particles_param_0[72], .param .u64 copy_scan_values_to_first_particles_param_1 ) { .reg .pred %p<2>; .reg .f32 %f<2>; .reg .b32 %r<27>; .reg .b64 %rd<21>; ld.param.u64 %rd8, [copy_scan_values_to_first_particles_param_1]; ld.param.u64 %rd3, [copy_scan_values_to_first_particles_param_0+24]; ld.param.u64 %rd2, [copy_scan_values_to_first_particles_param_0+16]; cvta.to.global.u64 %rd9, %rd3; mov.u32 %r4, %ntid.z; mov.u32 %r5, %ntid.y; mov.u32 %r6, %ntid.x; mov.b64 %rd10, {%r6, %r5}; mov.u32 %r7, %ctaid.z; mov.u32 %r8, %nctaid.y; mov.u32 %r9, %ctaid.y; mad.lo.s32 %r10, %r7, %r8, %r9; mov.u32 %r11, %nctaid.x; mov.u32 %r12, %ctaid.x; mad.lo.s32 %r13, %r10, %r11, %r12; and.b64 %rd11, %rd10, 4294967295; cvt.u64.u32 %rd12, %r5; bfi.b64 %rd13, %rd12, %rd11, 32, 32; cvt.u64.u32 %rd14, %r4; mov.b64 {%r14, %r15}, %rd13; mov.b64 {%r16, %r17}, %rd14; mul.lo.s32 %r18, %r14, %r13; mul.lo.s32 %r19, %r18, %r15; mov.u32 %r20, %tid.z; mov.u32 %r21, %tid.y; mad.lo.s32 %r22, %r20, %r5, %r21; mov.u32 %r23, %tid.x; mad.lo.s32 %r24, %r22, %r6, %r23; mad.lo.s32 %r1, %r19, %r16, %r24; ld.global.u32 %r25, [%rd9]; setp.ge.u32 %p1, %r1, %r25; @%p1 bra $L__BB14_2; cvta.to.global.u64 %rd15, %rd8; cvta.to.global.u64 %rd16, %rd2; mul.wide.u32 %rd17, %r1, 24; add.s64 %rd18, %rd16, %rd17; mul.wide.u32 %rd19, %r1, 4; add.s64 %rd20, %rd15, %rd19; ld.global.u32 %r26, [%rd20]; st.global.u32 [%rd18+8], %r26; $L__BB14_2: ret; }
// .globl finalize_particles_sort
// finalize_particles_sort: one thread per particle (count = param_1). Recomputes the
// particle's 64-bit cell key exactly as update_block_particle_count does (divide by
// cell size, round, clamp, pack), probes the same hash table (param_2+32 / +40), then
// atomically claims a slot in the per-block cursor array param_3 and scatters its own
// particle index %r1 into the sorted-index array param_4.
.visible .entry finalize_particles_sort( .param .u64 finalize_particles_sort_param_0, .param .u32 finalize_particles_sort_param_1, .param .align 8 .b8 finalize_particles_sort_param_2[72], .param .u64 finalize_particles_sort_param_3, .param .u64 finalize_particles_sort_param_4 ) { .reg .pred %p<12>; .reg .f32 %f<15>; .reg .b32 %r<59>; .reg .b64 %rd<71>; ld.param.u64 %rd10, [finalize_particles_sort_param_0]; ld.param.u32 %r4, [finalize_particles_sort_param_1]; ld.param.u64 %rd18, [finalize_particles_sort_param_3]; ld.param.u64 %rd19, [finalize_particles_sort_param_4]; ld.param.u32 %r3, [finalize_particles_sort_param_2+40]; ld.param.u64 %rd14, [finalize_particles_sort_param_2+32]; ld.param.f32 %f2, [finalize_particles_sort_param_2]; mov.u32 %r5, %ntid.z; mov.u32 %r6, %ntid.y; mov.u32 %r7, %ntid.x; mov.b64 %rd20, {%r7, %r6}; mov.u32 %r8, %ctaid.z; mov.u32 %r9, %nctaid.y; mov.u32 %r10, %ctaid.y; mad.lo.s32 %r11, %r8, %r9, %r10; mov.u32 %r12, %nctaid.x; mov.u32 %r13, %ctaid.x; mad.lo.s32 %r14, %r11, %r12, %r13; and.b64 %rd21, %rd20, 4294967295; cvt.u64.u32 %rd22, %r6; bfi.b64 %rd23, %rd22, %rd21, 32, 32; cvt.u64.u32 %rd24, %r5; mov.b64 {%r15, %r16}, %rd23; mov.b64 {%r17, %r18}, %rd24; mul.lo.s32 %r19, %r15, %r14; mul.lo.s32 %r20, %r19, %r16; mov.u32 %r21, %tid.z; mov.u32 %r22, %tid.y; mad.lo.s32 %r23, %r21, %r6, %r22; mov.u32 %r24, %tid.x; mad.lo.s32 %r25, %r23, %r7, %r24; mad.lo.s32 %r1, %r20, %r17, %r25; setp.ge.u32 %p1, %r1, %r4; @%p1 bra $L__BB15_7; cvta.to.global.u64 %rd25, %rd10; cvta.to.global.u64 %rd1, %rd14; mul.wide.u32 %rd26, %r1, 8; add.s64 %rd27, %rd25, %rd26; ld.global.u32 %rd28, [%rd27]; ld.global.u32 %rd29, [%rd27+4]; bfi.b64 %rd30, %rd29, %rd28, 32, 32; cvt.u32.u64 
// Generated PTX (NVVM). Cell-key computation: divide both f32 components by cell size
// %f2, round half-away-from-zero, clamp with NaN/overflow fallbacks, pack into key %rd2
// — byte-for-byte the same sequence as in update_block_particle_count.
%r26, %rd30; mov.b32 %f3, %r26; div.rn.f32 %f4, %f3, %f2; shr.u64 %rd31, %rd30, 32; cvt.u32.u64 %r27, %rd31; mov.b32 %f5, %r27; div.rn.f32 %f6, %f5, %f2; mov.b32 %r28, %f4; and.b32 %r29, %r28, -2147483648; or.b32 %r30, %r29, 1056964608; mov.b32 %f7, %r30; add.rz.f32 %f8, %f4, %f7; cvt.rzi.f32.f32 %f9, %f8; setp.leu.f32 %p2, %f9, 0f5EFFFFFF; max.f32 %f10, %f9, 0fDF000000; cvt.rzi.s64.f32 %rd32, %f10; setp.num.f32 %p3, %f9, %f9; mov.b32 %r31, %f6; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r32, 1056964608; mov.b32 %f11, %r33; add.rz.f32 %f12, %f6, %f11; cvt.rzi.f32.f32 %f13, %f12; setp.leu.f32 %p4, %f13, 0f5EFFFFFF; max.f32 %f14, %f13, 0fDF000000; cvt.rzi.s64.f32 %rd33, %f14; setp.num.f32 %p5, %f13, %f13; add.s64 %rd34, %rd32, 8589934590; shr.u64 %rd35, %rd34, 2; shl.b64 %rd36, %rd33, 30; and.b64 %rd37, %rd35, 4294967295; and.pred %p6, %p3, %p2; selp.b64 %rd38, %rd37, 2147483647, %p6; add.s64 %rd39, %rd36, 9223372034707292160; and.b64 %rd40, %rd39, -4294967296; and.pred %p7, %p5, %p4; selp.b64 %rd41, %rd40, 9223372032559808512, %p7; or.b64 %rd2, %rd41, %rd38;
// Hash + linear probe (same mix constants); empty slot (-1) -> particle's block absent, return.
shr.u64 %rd42, %rd2, 16; xor.b64 %rd43, %rd42, %rd2; mul.lo.s64 %rd44, %rd43, 2246822507; shr.u64 %rd45, %rd44, 13; xor.b64 %rd46, %rd45, %rd44; mul.lo.s64 %rd47, %rd46, 3266489909; shr.u64 %rd48, %rd47, 16; xor.b64 %rd49, %rd48, %rd47; cvt.u64.u32 %rd50, %r3; add.s64 %rd3, %rd50, -1; and.b64 %rd69, %rd49, %rd3; shl.b64 %rd51, %rd69, 4; add.s64 %rd52, %rd1, %rd51; ld.global.u64 %rd5, [%rd52]; setp.eq.s64 %p8, %rd5, %rd2; @%p8 bra $L__BB15_6; setp.eq.s64 %p9, %rd5, -1; @%p9 bra $L__BB15_7; $L__BB15_4: add.s64 %rd53, %rd69, 1; and.b64 %rd69, %rd53, %rd3; shl.b64 %rd54, %rd69, 4; add.s64 %rd55, %rd1, %rd54; ld.global.u64 %rd8, [%rd55]; setp.eq.s64 %p10, %rd8, %rd2; @%p10 bra $L__BB15_6; setp.eq.s64 %p11, %rd8, -1; @%p11 bra $L__BB15_7; bra.uni $L__BB15_4;
// Hit: fetch-and-add the block's cursor in param_3, store this particle index at the
// claimed position in param_4.
$L__BB15_6: shl.b64 %rd58, %rd69, 4; add.s64 %rd59, %rd1, %rd58; ld.global.u32 %r36, [%rd59+8]; mul.wide.u32 %rd60, %r36, 4; add.s64 %rd57, %rd18, %rd60; mov.u32 %r35, 1;
// begin inline asm
cvta.to.global.u64 %rd56, %rd57;atom.global.add.u32 %r34, [%rd56], %r35;
// end inline asm
cvta.to.global.u64 %rd61, %rd19; mul.wide.u32 %rd62, %r34, 4; add.s64 %rd63, %rd61, %rd62; st.global.u32 [%rd63], %r1; $L__BB15_7: ret; }
// .globl write_blocks_multiplicity_to_scan_value
// write_blocks_multiplicity_to_scan_value: for each block i < *(u32*)(param_0+24),
// multiplicity = ceil(count_at_offset_12 / param_3); traps if param_3 == 0.
// Routed by the low 2 bits of the word at offset +16: if they are 0 the multiplicity
// goes to scan array param_1 (param_2 gets 0), otherwise to param_2 (param_1 gets 0).
.visible .entry write_blocks_multiplicity_to_scan_value( .param .align 8 .b8 write_blocks_multiplicity_to_scan_value_param_0[72], .param .u64 write_blocks_multiplicity_to_scan_value_param_1, .param .u64 write_blocks_multiplicity_to_scan_value_param_2, .param .u32 write_blocks_multiplicity_to_scan_value_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<2>; .reg .b32 %r<36>; .reg .b64 %rd<24>; ld.param.u64 %rd8, [write_blocks_multiplicity_to_scan_value_param_1]; ld.param.u64 %rd9, [write_blocks_multiplicity_to_scan_value_param_2]; ld.param.u32 %r4, [write_blocks_multiplicity_to_scan_value_param_3]; ld.param.u64 %rd3, [write_blocks_multiplicity_to_scan_value_param_0+24]; ld.param.u64 %rd2, [write_blocks_multiplicity_to_scan_value_param_0+16]; cvta.to.global.u64 %rd10, %rd3; mov.u32 %r5, %ntid.z; mov.u32 %r6, %ntid.y; mov.u32 %r7, %ntid.x; mov.b64 %rd11, {%r7, %r6}; mov.u32 %r8, %ctaid.z; mov.u32 %r9, %nctaid.y; mov.u32 %r10, %ctaid.y; mad.lo.s32 %r11, %r8, %r9, %r10; mov.u32 %r12, %nctaid.x; mov.u32 %r13, %ctaid.x; mad.lo.s32 %r14, %r11, %r12, %r13; and.b64 %rd12, %rd11, 4294967295; cvt.u64.u32 %rd13, %r6; bfi.b64 %rd14, %rd13, %rd12, 32, 32; cvt.u64.u32 %rd15, %r5; mov.b64 {%r15, %r16}, %rd14; mov.b64 {%r17, %r18}, %rd15; mul.lo.s32 %r19, %r15, %r14; mul.lo.s32 %r20, %r19, %r16; mov.u32 %r21, %tid.z; mov.u32 %r22, %tid.y; mad.lo.s32 %r23, %r21, %r6, %r22; mov.u32 %r24, %tid.x; mad.lo.s32 %r25, %r23, %r7, %r24; mad.lo.s32 %r1, %r20, %r17, %r25; ld.global.u32 %r26, [%rd10]; setp.ge.u32 %p1, %r1, %r26; @%p1 bra $L__BB16_3; setp.eq.s32 %p2, %r4, 0; @%p2 bra $L__BB16_4; cvta.to.global.u64 %rd16, %rd2; mul.wide.u32 %rd17, %r1, 24; add.s64 %rd18, %rd16, %rd17; 
// Generated PTX (NVVM). Ceil-division %r31 = ceil(count/%r4) via quotient + remainder
// test, then route the multiplicity to one of the two scan arrays by the low 2 bits of
// the flag word at record offset +16 (the other array receives 0 for this index).
ld.global.u32 %r27, [%rd18+12]; div.u32 %r28, %r27, %r4; mul.lo.s32 %r29, %r28, %r4; setp.ne.s32 %p3, %r27, %r29; selp.u32 %r30, 1, 0, %p3; add.s32 %r31, %r28, %r30; ld.global.u32 %r32, [%rd18+16]; and.b32 %r33, %r32, 3; setp.eq.s32 %p4, %r33, 0; selp.b32 %r34, %r31, 0, %p4; selp.b32 %r35, 0, %r31, %p4; cvta.to.global.u64 %rd19, %rd8; mul.wide.u32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; st.global.u32 [%rd21], %r34; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd20; st.global.u32 [%rd23], %r35; $L__BB16_3: ret;
// Divisor param_3 == 0 is a hard error.
$L__BB16_4: trap; }
// .globl init_gpu_dispatch_blocks_mapping
// init_gpu_dispatch_blocks_mapping: one CTA per 24-byte block record (base param_0+16);
// traps if param_3 == 0. Computes multiplicity %r3 = ceil(count_at_+12 / param_3), then
// threads stride by ntid.x over k in [tid.x, %r3): pick the scan array (param_1 vs
// param_2) and the output array (param_0+48 vs param_0+56) by the low 2 bits of the
// flag byte at record offset +16, and write the 8-byte pair
// { block id = ctaid.x, k*param_3 + first_index_at_+8 } at position scan[ctaid.x] + k.
.visible .entry init_gpu_dispatch_blocks_mapping( .param .align 8 .b8 init_gpu_dispatch_blocks_mapping_param_0[72], .param .u64 init_gpu_dispatch_blocks_mapping_param_1, .param .u64 init_gpu_dispatch_blocks_mapping_param_2, .param .u32 init_gpu_dispatch_blocks_mapping_param_3 ) { .reg .pred %p<6>; .reg .b16 %rs<3>; .reg .f32 %f<2>; .reg .b32 %r<19>; .reg .b64 %rd<23>; ld.param.u64 %rd11, [init_gpu_dispatch_blocks_mapping_param_1]; ld.param.u64 %rd12, [init_gpu_dispatch_blocks_mapping_param_2]; ld.param.u32 %r11, [init_gpu_dispatch_blocks_mapping_param_3]; ld.param.u64 %rd9, [init_gpu_dispatch_blocks_mapping_param_0+56]; ld.param.u64 %rd8, [init_gpu_dispatch_blocks_mapping_param_0+48]; ld.param.u64 %rd5, [init_gpu_dispatch_blocks_mapping_param_0+16]; mov.u32 %r18, %tid.x; mov.u32 %r2, %ctaid.x; setp.eq.s32 %p1, %r11, 0; @%p1 bra $L__BB17_5; cvt.u64.u32 %rd1, %r2; cvta.to.global.u64 %rd13, %rd5; mul.wide.u32 %rd14, %r2, 24; add.s64 %rd15, %rd13, %rd14; add.s64 %rd2, %rd15, 16; ld.global.u32 %r12, [%rd15+12]; div.u32 %r13, %r12, %r11; mul.lo.s32 %r14, %r13, %r11; setp.ne.s32 %p2, %r12, %r14; selp.u32 %r15, 1, 0, %p2; add.s32 %r3, %r13, %r15; setp.ge.u32 %p3, %r18, %r3; @%p3 bra $L__BB17_4; ld.global.u32 %r4, [%rd2+-8]; ld.global.u8 %rs1, [%rd2]; and.b16 %rs2, %rs1, 3; setp.ne.s16 %p4, %rs2, 0; selp.b64 %rd16, %rd12, %rd11, %p4; cvta.to.global.u64 %rd17, %rd16; shl.b64 %rd18, %rd1, 2; add.s64 %rd19, %rd17, %rd18; ld.global.u32 %r5, [%rd19]; mov.u32 %r6, %ntid.x; selp.b64 %rd20, %rd9, %rd8, %p4; cvta.to.global.u64 %rd3, %rd20;
// Strided emit loop: one 8-byte {block, start_index} pair per k.
$L__BB17_3: mad.lo.s32 %r16, %r18, %r11, %r4; add.s32 %r17, %r18, %r5; mul.wide.u32 %rd21, %r17, 8; add.s64 %rd22, %rd3, %rd21; st.global.u32 [%rd22], %r2; st.global.u32 [%rd22+4], %r16; add.s32 %r18, %r18, %r6; setp.lt.u32 %p5, %r18, %r3; @%p5 bra $L__BB17_3; $L__BB17_4: ret;
// param_3 == 0 is a hard error.
$L__BB17_5: trap; }
// .globl estimate_timestep_length
// estimate_timestep_length: per-particle timestep estimate (count = param_5), reduced
// into a single global u64 minimum at param_8. Particles with a nonzero u8 flag at the
// start of their 24-byte record (param_2) are skipped. The particle's 96-byte model
// record (param_6, selected by the u64 at record offset +16) starts with a u32 model
// tag; tag 3 short-circuits to dt = min(param_1, f32::MAX). Other tags compute
// sqrt-based bounds from model coefficients and the particle's 2-component f32 vector
// at param_4 (presumably its velocity — TODO confirm) — see the tag dispatch below.
.visible .entry estimate_timestep_length( .param .f32 estimate_timestep_length_param_0, .param .f32 estimate_timestep_length_param_1, .param .u64 estimate_timestep_length_param_2, .param .u64 estimate_timestep_length_param_3, .param .u64 estimate_timestep_length_param_4, .param .u64 estimate_timestep_length_param_5, .param .u64 estimate_timestep_length_param_6, .param .f32 estimate_timestep_length_param_7, .param .u64 estimate_timestep_length_param_8 ) { .reg .pred %p<38>; .reg .b16 %rs<3>; .reg .f32 %f<233>; .reg .b32 %r<50>; .reg .b64 %rd<38>; ld.param.f32 %f33, [estimate_timestep_length_param_1]; ld.param.u64 %rd9, [estimate_timestep_length_param_2]; ld.param.u64 %rd10, [estimate_timestep_length_param_3]; ld.param.u64 %rd11, [estimate_timestep_length_param_4]; ld.param.u64 %rd12, [estimate_timestep_length_param_6]; ld.param.f32 %f34, [estimate_timestep_length_param_7]; mov.u32 %r4, %ntid.z; mov.u32 %r5, %ntid.y; mov.u32 %r6, %ntid.x; mov.b64 %rd14, {%r6, %r5}; mov.u32 %r7, %ctaid.z; mov.u32 %r8, %nctaid.y; mov.u32 %r9, %ctaid.y; mad.lo.s32 %r10, %r7, %r8, %r9; mov.u32 %r11, %nctaid.x; mov.u32 %r12, %ctaid.x; mad.lo.s32 %r13, %r10, %r11, %r12; and.b64 %rd15, %rd14, 4294967295; cvt.u64.u32 %rd16, %r5; bfi.b64 %rd17, %rd16, %rd15, 32, 32; cvt.u64.u32 %rd18, %r4; mov.b64 {%r14, %r15}, %rd17; mov.b64 {%r16, %r17}, %rd18; mul.lo.s32 %r18, %r14, %r13; mul.lo.s32 %r19, %r18, %r15; mov.u32 %r20, %tid.z; mov.u32 %r21, %tid.y; mad.lo.s32 %r22, %r20, %r5, %r21; mov.u32 %r23, %tid.x; mad.lo.s32 %r24, %r22, 
// Generated PTX (NVVM) — annotations only; the float-pow expansion below is
// compiler-emitted and must not be hand-edited.
// Bounds check, skip flagged particles, load the 96-byte model record, dispatch on tag.
%r6, %r23; mad.lo.s32 %r1, %r19, %r16, %r24; ld.param.u32 %r25, [estimate_timestep_length_param_5]; setp.ge.u32 %p2, %r1, %r25; @%p2 bra $L__BB18_27; cvt.u64.u32 %rd1, %r1; cvta.to.global.u64 %rd19, %rd9; mul.wide.u32 %rd20, %r1, 24; add.s64 %rd2, %rd19, %rd20; ld.global.u8 %rs1, [%rd2]; setp.ne.s16 %p3, %rs1, 0; @%p3 bra $L__BB18_27; ld.global.u64 %rd3, [%rd2+16]; cvta.to.global.u64 %rd21, %rd12; mul.lo.s64 %rd22, %rd3, 96; add.s64 %rd4, %rd21, %rd22; ld.global.u32 %r2, [%rd4]; setp.eq.s32 %p4, %r2, 3; @%p4 bra $L__BB18_25; bra.uni $L__BB18_3;
// Tag 3: dt candidate = min(param_1, f32::MAX).
$L__BB18_25: mov.f32 %f200, 0f7F7FFFFF; min.f32 %f232, %f33, %f200; bra.uni $L__BB18_26;
// Other tags: %rd7 -> the particle's 8-byte f32 pair (param_4), %rd8 -> its 32-byte
// record in param_3. Dispatch on tag 1 / 2 / 3(u16) / default.
$L__BB18_3: shl.b64 %rd23, %rd1, 5; shl.b64 %rd24, %rd1, 3; cvt.u16.u32 %rs2, %r2; cvta.to.global.u64 %rd25, %rd11; add.s64 %rd7, %rd25, %rd24; cvta.to.global.u64 %rd26, %rd10; add.s64 %rd8, %rd26, %rd23; setp.eq.s16 %p5, %rs2, 1; @%p5 bra $L__BB18_22; setp.eq.s16 %p6, %rs2, 2; @%p6 bra $L__BB18_7; setp.ne.s16 %p7, %rs2, 3; @%p7 bra $L__BB18_23;
// Fallthrough tag: only the squared norm of the f32 pair; candidate dt = 0.
ld.global.f32 %f36, [%rd7]; ld.global.f32 %f37, [%rd7+4]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; add.f32 %f230, %f39, 0f00000000; mov.f32 %f231, 0f00000000; bra.uni $L__BB18_24;
// Tag 1: wave-speed-style bound — sqrt(((c12 + 2*c16/3)*param_7 + (2/3)*c16*param_7) / density)
// vs |v|; candidate dt = (aux * c8) / max(|v|, wave_speed).
$L__BB18_22: ld.global.u64 %rd27, [%rd4+24]; shl.b64 %rd28, %rd1, 4; add.s64 %rd29, %rd27, %rd28; ld.f32 %f153, [%rd29+8]; ld.global.f32 %f154, [%rd8+4]; ld.global.f32 %f155, [%rd8]; div.rn.f32 %f156, %f155, %f154; ld.global.f32 %f157, [%rd4+16]; add.f32 %f158, %f157, %f157; div.rn.f32 %f159, %f158, 0f40400000; ld.global.f32 %f160, [%rd4+12]; add.f32 %f161, %f160, %f159; mul.f32 %f162, %f161, %f34; mul.f32 %f163, %f157, %f34; fma.rn.f32 %f164, %f163, 0f3FAAAAAB, %f162; div.rn.f32 %f165, %f164, %f156; sqrt.rn.f32 %f166, %f165; ld.global.f32 %f167, [%rd7]; ld.global.f32 %f168, [%rd7+4]; mul.f32 %f169, %f168, %f168; fma.rn.f32 %f170, %f167, %f167, %f169; add.f32 %f230, %f170, 0f00000000; sqrt.rn.f32 %f171, %f230; max.f32 %f172, %f171, %f166; ld.global.f32 %f173, [%rd4+8]; mul.f32 %f174, %f153, %f173; div.rn.f32 %f231, %f174, %f172; bra.uni $L__BB18_24;
// Tag 2: computes powf(%f5, %r3) in software — the long sequence from here to
// $L__BB18_21 is the standard NVVM single-precision pow expansion (log2 polynomial,
// exp2 via ex2.approx, plus full special-case handling for 0 / inf / NaN / negative
// base / odd-integer exponent). Do not modify individual steps.
$L__BB18_7: ld.global.f32 %f43, [%rd8+4]; ld.global.f32 %f44, [%rd8]; div.rn.f32 %f2, %f44, %f43; ld.global.f32 %f3, [%rd8+12]; div.rn.f32 %f45, %f2, %f3; ld.global.f32 %f4, [%rd4+8]; div.rn.f32 %f5, %f45, %f2; ld.global.u32 %r3, [%rd4+12]; cvt.rn.f32.s32 %f6, %r3; abs.f32 %f8, %f5; setp.lt.f32 %p8, %f8, 0f00800000; mul.f32 %f50, %f8, 0f4B800000; selp.f32 %f51, %f50, %f8, %p8; selp.f32 %f52, 0fC3170000, 0fC2FE0000, %p8; mov.b32 %r26, %f51; and.b32 %r27, %r26, 8388607; or.b32 %r28, %r27, 1065353216; mov.b32 %f53, %r28; shr.u32 %r29, %r26, 23; cvt.rn.f32.u32 %f54, %r29; add.f32 %f55, %f52, %f54; setp.gt.f32 %p9, %f53, 0f3FB504F3; mul.f32 %f56, %f53, 0f3F000000; add.f32 %f57, %f55, 0f3F800000; selp.f32 %f58, %f57, %f55, %p9; selp.f32 %f59, %f56, %f53, %p9; add.f32 %f60, %f59, 0fBF800000; add.f32 %f41, %f59, 0f3F800000;
// begin inline asm
rcp.approx.ftz.f32 %f40,%f41;
// end inline asm
add.f32 %f61, %f60, %f60; mul.f32 %f62, %f40, %f61; mul.f32 %f63, %f62, %f62; mov.f32 %f64, 0f3C4CAF63; mov.f32 %f65, 0f3B18F0FE; fma.rn.f32 %f66, %f65, %f63, %f64; mov.f32 %f67, 0f3DAAAABD; fma.rn.f32 %f68, %f66, %f63, %f67; mul.rn.f32 %f69, %f68, %f63; mul.rn.f32 %f70, %f69, %f62; sub.f32 %f71, %f60, %f62; add.f32 %f72, %f71, %f71; neg.f32 %f73, %f62; fma.rn.f32 %f74, %f73, %f60, %f72; mul.rn.f32 %f75, %f40, %f74; add.f32 %f76, %f70, %f62; sub.f32 %f77, %f62, %f76; add.f32 %f78, %f70, %f77; add.f32 %f79, %f75, %f78; add.f32 %f80, %f76, %f79; sub.f32 %f81, %f76, %f80; add.f32 %f82, %f79, %f81; mov.f32 %f83, 0f3F317200; mul.rn.f32 %f84, %f58, %f83; mov.f32 %f85, 0f35BFBE8E; mul.rn.f32 %f86, %f58, %f85; add.f32 %f87, %f84, %f80; sub.f32 %f88, %f84, %f87; add.f32 %f89, %f80, %f88; add.f32 %f90, %f82, %f89; add.f32 %f91, %f86, %f90; add.f32 %f92, %f87, %f91; sub.f32 %f93, %f87, %f92; add.f32 %f94, %f91, %f93; abs.f32 %f9, %f6; setp.gt.f32 %p10, %f9, 0f77F684DF; mul.f32 %f95, %f6, 0f39000000; selp.f32 %f96, %f95, %f6, %p10; mul.rn.f32 %f97, %f96, %f92; neg.f32 %f98, %f97; fma.rn.f32 %f99, %f96, %f92, %f98; fma.rn.f32 %f100, %f96, %f94, %f99; mov.f32 %f101, 0f00000000; fma.rn.f32 %f102, %f101, %f92, %f100; add.rn.f32 %f103, %f97, %f102; neg.f32 %f104, %f103; add.rn.f32 %f105, %f97, %f104; add.rn.f32 %f106, %f105, %f102; mov.b32 %r30, %f103; setp.eq.s32 %p11, %r30, 1118925336; add.s32 %r31, %r30, -1; mov.b32 %f107, %r31; add.f32 %f108, %f106, 0f37000000; selp.f32 %f10, %f108, %f106, %p11; selp.f32 %f109, %f107, %f103, %p11; mov.f32 %f110, 0f3FB8AA3B; mul.rn.f32 %f111, %f109, %f110; cvt.rzi.f32.f32 %f112, %f111; abs.f32 %f113, %f112; setp.gt.f32 %p12, %f113, 0f42FC0000; mov.b32 %r32, %f112; and.b32 %r33, %r32, -2147483648; or.b32 %r34, %r33, 1123811328; mov.b32 %f114, %r34; selp.f32 %f115, %f114, %f112, %p12; mov.f32 %f116, 0fBF317218; fma.rn.f32 %f117, %f115, %f116, %f109; mov.f32 %f118, 0f3102E308; fma.rn.f32 %f119, %f115, %f118, %f117; mul.f32 %f120, %f119, 0f3FB8AA3B; add.f32 %f121, %f115, 0f4B40007F; mov.b32 %r35, %f121; shl.b32 %r36, %r35, 23; mov.b32 %f122, %r36; ex2.approx.ftz.f32 %f123, %f120; mul.f32 %f11, %f123, %f122; setp.eq.f32 %p13, %f11, 0f7F800000; mov.f32 %f227, 0f7F800000; @%p13 bra $L__BB18_9; fma.rn.f32 %f227, %f11, %f10, %f11; $L__BB18_9: cvt.rn.f32.s32 %f226, %r3; mul.f32 %f225, %f226, 0f3F000000; cvt.rzi.f32.f32 %f224, %f225; add.f32 %f223, %f224, %f224; sub.f32 %f222, %f226, %f223; abs.f32 %f221, %f222; setp.lt.f32 %p14, %f5, 0f00000000; setp.eq.f32 %p15, %f221, 0f3F800000; and.pred %p1, %p14, %p15; setp.eq.f32 %p16, %f5, 0f00000000; @%p16 bra $L__BB18_13; bra.uni $L__BB18_10; $L__BB18_13: add.f32 %f127, %f5, %f5; mov.b32 %r39, %f127; selp.b32 %r40, %r39, 0, %p15; or.b32 %r41, %r40, 2139095040; setp.lt.s32 %p20, %r3, 0; selp.b32 %r42, %r41, %r40, %p20; mov.b32 %f229, %r42; bra.uni $L__BB18_14;
// Default (non-1/2/3) tag: analogous wave-speed bound with coefficients at +16/+20 and
// stiffness scaled by the f32 at record offset +8; candidate dt = c12 * param_7 / bound.
$L__BB18_23: ld.global.u64 %rd30, [%rd4+24]; shl.b64 %rd31, %rd1, 4; add.s64 %rd32, %rd30, %rd31; ld.f32 %f175, [%rd32+8]; ld.global.f32 %f176, [%rd8+4]; ld.global.f32 %f177, [%rd8]; div.rn.f32 %f178, %f177, %f176; ld.global.f32 %f179, [%rd4+20]; add.f32 %f180, %f179, %f179; div.rn.f32 %f181, %f180, 0f40400000; ld.global.f32 %f182, [%rd4+16]; add.f32 %f183, %f182, %f181; mul.f32 %f184, %f175, %f183; mul.f32 %f185, %f175, %f179; fma.rn.f32 %f186, %f185, 0f3FAAAAAB, %f184; div.rn.f32 %f187, %f186, %f178; sqrt.rn.f32 %f188, %f187; ld.global.f32 %f189, [%rd7]; ld.global.f32 %f190, [%rd7+4]; mul.f32 %f191, %f190, %f190; fma.rn.f32 %f192, %f189, %f189, %f191; add.f32 %f230, %f192, 0f00000000; sqrt.rn.f32 %f193, %f230; max.f32 %f194, %f193, %f188; ld.global.f32 %f195, [%rd4+12]; mul.f32 %f196, %f195, %f34; div.rn.f32 %f231, %f196, %f194; bra.uni $L__BB18_24;
// pow special-case cleanup continues (sign of result, NaN for negative base with
// non-integer exponent, inf/NaN propagation, x == ±1 / n == 0 identities).
$L__BB18_10: mov.b32 %r37, %f227; xor.b32 %r38, %r37, -2147483648; mov.b32 %f124, %r38; selp.f32 %f229, %f124, %f227, %p1; setp.geu.f32 %p17, %f5, 0f00000000; @%p17 bra $L__BB18_14; cvt.rn.f32.s32 %f220, %r3; cvt.rzi.f32.f32 %f125, %f220; setp.eq.f32 %p18, %f125, %f220; @%p18 bra $L__BB18_14; mov.f32 %f229, 0f7FFFFFFF; $L__BB18_14: cvt.rn.f32.s32 %f206, %r3; abs.f32 %f205, %f206; abs.f32 %f204, %f5; add.f32 %f128, %f204, %f205; mov.b32 %r43, %f128; setp.lt.s32 %p21, %r43, 2139095040; @%p21 bra $L__BB18_21; cvt.rn.f32.s32 %f214, %r3; abs.f32 %f213, %f214; abs.f32 %f212, %f5; setp.gtu.f32 %p22, %f212, 0f7F800000; setp.gtu.f32 %p23, %f213, 0f7F800000; or.pred %p24, %p22, %p23; @%p24 bra $L__BB18_20; bra.uni $L__BB18_16; $L__BB18_20: cvt.rn.f32.s32 %f219, %r3; add.f32 %f229, %f5, %f219; bra.uni $L__BB18_21; $L__BB18_16: cvt.rn.f32.s32 %f216, %r3; abs.f32 %f215, %f216; setp.eq.f32 %p25, %f215, 0f7F800000; @%p25 bra $L__BB18_19; bra.uni $L__BB18_17; $L__BB18_19: abs.f32 %f218, %f5; setp.gt.f32 %p28, %f218, 0f3F800000; selp.b32 %r47, 2139095040, 0, %p28; xor.b32 %r48, %r47, 2139095040; setp.lt.s32 %p29, %r3, 0; selp.b32 %r49, %r48, %r47, %p29; mov.b32 %f129, %r49; setp.eq.f32 %p30, %f5, 0fBF800000; selp.f32 %f229, 0f3F800000, %f129, %p30; bra.uni $L__BB18_21; $L__BB18_17: abs.f32 %f217, %f5; setp.neu.f32 %p26, %f217, 0f7F800000; @%p26 bra $L__BB18_21; setp.gt.s32 %p27, %r3, -1; selp.b32 %r44, 2139095040, 0, %p27; or.b32 %r45, %r44, -2147483648; selp.b32 %r46, %r45, %r44, %p1; mov.b32 %f229, %r46;
// Tag 2 epilogue: dt candidate from sqrt(((gamma-1)*density) / (-6*max(c8*(pow-1), -c20)))
// scaled by param_7/gamma, capped by param_7 / sqrt(max(|v|^2, 1)/0.1).
$L__BB18_21: ld.param.f32 %f207, [estimate_timestep_length_param_7]; setp.eq.s32 %p31, %r3, 0; setp.eq.f32 %p32, %f5, 0f3F800000; mov.f32 %f130, 0f3F800000; or.pred %p33, %p32, %p31; add.f32 %f131, %f229, 0fBF800000; selp.f32 %f132, 0f00000000, %f131, %p33; mul.f32 %f133, %f4, %f132; ld.global.f32 %f134, [%rd4+20]; neg.f32 %f135, %f134; max.f32 %f136, %f133, %f135; add.f32 %f137, %f3, 0fBF800000; mul.f32 %f138, %f137, %f2; mul.f32 %f139, %f136, 0fC0C00000; fma.rn.f32 %f140, %f136, 0fC0C00000, %f139; div.rn.f32 %f141, %f138, %f140; sqrt.rn.f32 %f142, %f141; div.rn.f32 %f143, %f207, %f3; mul.f32 %f144, %f143, %f142; ld.global.f32 %f145, [%rd7]; ld.global.f32 %f146, [%rd7+4]; mul.f32 %f147, %f146, %f146; fma.rn.f32 %f148, %f145, %f145, %f147; add.f32 %f230, %f148, 0f00000000; max.f32 %f149, %f230, %f130; div.rn.f32 %f150, %f149, 0f3DCCCCCD; sqrt.rn.f32 %f151, %f150; div.rn.f32 %f152, %f207, %f151; min.f32 %f231, %f144, %f152;
// Join: dt = min(param_1, candidate, param_7 / |v|).
$L__BB18_24: ld.param.f32 %f209, [estimate_timestep_length_param_1]; ld.param.f32 %f208, [estimate_timestep_length_param_7]; sqrt.rn.f32 %f197, %f230; div.rn.f32 %f198, %f208, %f197; min.f32 %f199, %f209, %f231; min.f32 %f232, %f199, %f198;
// Floor at param_0 when param_1 > param_0 and dt < param_0; scale by 0f5368D4A5
// (~1e9 as f32 — fixed-point encoding so the f32 dt can be reduced with an integer
// min), saturate to u64, and red.global.min.u64 into the result cell at param_8.
$L__BB18_26: ld.param.u64 %rd37, [estimate_timestep_length_param_8]; ld.param.f32 %f211, [estimate_timestep_length_param_0]; ld.param.f32 %f210, [estimate_timestep_length_param_1]; setp.gt.f32 %p34, %f210, %f211; setp.lt.f32 %p35, %f232, %f211; and.pred %p36, %p34, %p35; selp.f32 %f201, %f211, %f232, %p36; mul.f32 %f202, %f201, 0f5368D4A5; setp.gt.f32 %p37, %f202, 0f5F7FFFFF; max.f32 %f203, %f202, 0f00000000; cvt.rzi.u64.f32 %rd36, %f203; selp.b64 %rd35, -1, %rd36, %p37;
// begin inline asm
cvta.to.global.u64 %rd33, %rd37;red.global.min.u64 [%rd33], %rd35;
// end inline asm
$L__BB18_27: ret; }
// Rust panic plumbing: core::result::unwrap_failed lowers to an immediate abort on device.
.func _ZN4core6result13unwrap_failed17ha84c04ab95b75c50E() .noreturn { trap; }