// // Generated by NVIDIA NVVM Compiler // // Compiler Build ID: CL-31833905 // Cuda compilation tools, release 11.8, V11.8.89 // Based on NVVM 7.0.1 // .version 7.8 .target sm_70 .address_size 64 // .globl g2p2g .func _ZN4core6result13unwrap_failed17h02aadeb87602f26eE () .noreturn ; // _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E has been demoted // _ZN20sparkl2d_kernels_ptx4cuda10prefix_sum14prefix_sum_51212shared_array6SHARED17hd8a2402217f77aa1E has been demoted .global .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162}; .visible .entry g2p2g( .param .f32 g2p2g_param_0, .param .u64 g2p2g_param_1, .param .u64 g2p2g_param_2, .param .u64 g2p2g_param_3, .param .u64 g2p2g_param_4, .param .u64 g2p2g_param_5, .param .u64 g2p2g_param_6, .param .u64 g2p2g_param_7, .param .u64 g2p2g_param_8, .param .u64 g2p2g_param_9, .param .align 8 .b8 g2p2g_param_10[72], .param .align 8 .b8 g2p2g_param_11[72], .param .u32 g2p2g_param_12, .param .u8 g2p2g_param_13 ) { .local .align 16 .b8 __local_depot0[112]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<724>; .reg .b16 %rs<38>; .reg .f32 %f<4948>; .reg .b32 %r<1707>; .reg .f64 %fd<27>; .reg .b64 %rd<1205>; // demoted variable .shared .align 8 .b8 _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E[4096]; mov.u64 %SPL, __local_depot0; cvta.local.u64 %SP, %SPL; ld.param.f32 %f846, [g2p2g_param_0]; ld.param.u64 %rd330, [g2p2g_param_3]; ld.param.u64 %rd331, [g2p2g_param_4]; ld.param.u64 %rd332, [g2p2g_param_5]; ld.param.u64 %rd333, [g2p2g_param_6]; ld.param.u64 %rd334, [g2p2g_param_7]; ld.param.u64 %rd335, [g2p2g_param_8]; ld.param.u8 %r303, [g2p2g_param_13]; ld.param.u8 %r304, [g2p2g_param_13+1]; prmt.b32 %r305, %r304, %r303, 30212; and.b32 %r306, %r305, 1; setp.eq.b32 %p3, %r306, 1; ld.param.u64 %rd350, [g2p2g_param_11+64]; ld.param.u64 %rd349, 
[g2p2g_param_11+56]; ld.param.u64 %rd348, [g2p2g_param_11+48]; ld.param.u64 %rd347, [g2p2g_param_11+32]; ld.param.u64 %rd345, [g2p2g_param_11+16]; ld.param.u64 %rd344, [g2p2g_param_11+8]; ld.param.f32 %f848, [g2p2g_param_11]; ld.param.u64 %rd343, [g2p2g_param_10+64]; ld.param.u32 %r300, [g2p2g_param_10+40]; ld.param.u64 %rd340, [g2p2g_param_10+32]; ld.param.u64 %rd337, [g2p2g_param_10+8]; add.u64 %rd1, %SPL, 64; add.u64 %rd4, %SPL, 48; add.u64 %rd5, %SPL, 64; add.u64 %rd6, %SPL, 96; cvta.to.global.u64 %rd8, %rd337; cvta.to.global.u64 %rd9, %rd340; cvta.to.global.u64 %rd11, %rd347; mov.u32 %r1, %tid.x; mov.u32 %r2, %ntid.x; setp.eq.s32 %p4, %r2, 0; @%p4 bra $L__BB0_536; mov.u32 %r307, %ctaid.x; selp.b64 %rd357, %rd349, %rd348, %p3; cvta.to.global.u64 %rd358, %rd357; mul.wide.u32 %rd359, %r307, 8; add.s64 %rd12, %rd358, %rd359; mov.u32 %r308, 64; div.u32 %r309, %r308, %r2; cvt.u64.u32 %rd13, %r309; mul.wide.u32 %rd14, %r309, %r1; setp.gt.u64 %p5, %rd14, 127; @%p5 bra $L__BB0_535; ld.global.u32 %r3, [%rd12+4]; ld.global.u32 %r310, [%rd12]; cvta.to.global.u64 %rd360, %rd345; mul.wide.u32 %rd361, %r310, 24; add.s64 %rd362, %rd360, %rd361; ld.global.u64 %rd363, [%rd362]; ld.global.v2.u32 {%r311, %r312}, [%rd362+8]; shr.u64 %rd364, %rd14, 4; and.b64 %rd15, %rd364, 1; shr.u64 %rd365, %rd14, 5; and.b64 %rd16, %rd365, 1; add.s64 %rd366, %rd15, %rd363; and.b64 %rd367, %rd366, 4294967295; shl.b64 %rd368, %rd14, 27; and.b64 %rd369, %rd368, 4294967296; add.s64 %rd370, %rd369, %rd363; and.b64 %rd371, %rd370, -4294967296; or.b64 %rd17, %rd371, %rd367; shr.u64 %rd372, %rd17, 16; xor.b64 %rd373, %rd372, %rd17; mul.lo.s64 %rd374, %rd373, 2246822507; shr.u64 %rd375, %rd374, 13; xor.b64 %rd376, %rd375, %rd374; mul.lo.s64 %rd377, %rd376, 3266489909; shr.u64 %rd378, %rd377, 16; xor.b64 %rd18, %rd378, %rd377; cvt.u64.u32 %rd379, %r300; add.s64 %rd19, %rd379, -1; and.b64 %rd1116, %rd18, %rd19; shl.b64 %rd380, %rd1116, 4; add.s64 %rd381, %rd9, %rd380; ld.global.u64 %rd21, [%rd381]; 
setp.eq.s64 %p6, %rd21, %rd17; @%p6 bra $L__BB0_16; bra.uni $L__BB0_3; $L__BB0_16: setp.gt.u32 %p17, %r2, 64; @%p17 bra $L__BB0_31; bra.uni $L__BB0_17; $L__BB0_3: setp.eq.s64 %p7, %rd21, -1; @%p7 bra $L__BB0_9; $L__BB0_5: add.s64 %rd382, %rd1116, 1; and.b64 %rd1116, %rd382, %rd19; shl.b64 %rd383, %rd1116, 4; add.s64 %rd384, %rd9, %rd383; ld.global.u64 %rd24, [%rd384]; setp.eq.s64 %p8, %rd24, %rd17; @%p8 bra $L__BB0_8; setp.ne.s64 %p9, %rd24, -1; @%p9 bra $L__BB0_5; setp.lt.u32 %p10, %r2, 65; @%p10 bra $L__BB0_10; bra.uni $L__BB0_31; $L__BB0_9: setp.gt.u32 %p12, %r2, 64; @%p12 bra $L__BB0_31; $L__BB0_10: and.b64 %rd25, %rd14, 15; add.s64 %rd26, %rd25, %rd13; shl.b64 %rd27, %rd15, 2; shl.b64 %rd28, %rd16, 2; add.s64 %rd385, %rd25, 1; max.u64 %rd29, %rd385, %rd26; sub.s64 %rd386, %rd29, %rd14; and.b64 %rd1118, %rd386, 3; setp.eq.s64 %p13, %rd1118, 0; mov.u64 %rd1124, %rd25; @%p13 bra $L__BB0_13; mov.u64 %rd1117, %rd25; $L__BB0_12: .pragma "nounroll"; add.s64 %rd1124, %rd1117, 1; shr.u64 %rd387, %rd1117, 2; and.b64 %rd388, %rd387, 3; and.b64 %rd389, %rd1117, 3; or.b64 %rd390, %rd389, %rd27; or.b64 %rd391, %rd388, %rd28; shl.b64 %rd392, %rd391, 3; or.b64 %rd393, %rd390, %rd392; shl.b64 %rd394, %rd393, 6; mov.u64 %rd395, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; mov.u64 %rd396, 0; add.s64 %rd397, %rd395, %rd394; st.shared.u64 [%rd397+32], %rd396; mov.f32 %f849, 0f00000000; st.shared.v2.f32 [%rd397+40], {%f849, %f849}; st.shared.v2.f32 [%rd397+24], {%f849, %f849}; st.shared.v2.f32 [%rd397+16], {%f849, %f849}; st.shared.u32 [%rd397+56], %rd396; st.shared.u64 [%rd397+48], %rd396; st.shared.u64 [%rd397], %rd396; mov.u32 %r313, -1; st.shared.u32 [%rd397+60], %r313; add.s64 %rd1118, %rd1118, -1; setp.ne.s64 %p14, %rd1118, 0; mov.u64 %rd1117, %rd1124; @%p14 bra $L__BB0_12; $L__BB0_13: not.b64 %rd398, %rd25; add.s64 %rd399, %rd29, %rd398; setp.lt.u64 %p15, %rd399, 3; @%p15 bra $L__BB0_31; add.s64 %rd400, %rd1124, 3; and.b64 
%rd401, %rd400, 3; add.s64 %rd402, %rd1124, 1; and.b64 %rd403, %rd402, 3; and.b64 %rd404, %rd1124, 3; or.b64 %rd36, %rd404, %rd27; or.b64 %rd37, %rd403, %rd27; or.b64 %rd38, %rd401, %rd27; shr.u64 %rd1123, %rd400, 2; add.s64 %rd405, %rd1124, 2; shr.u64 %rd1122, %rd405, 2; shr.u64 %rd1121, %rd1124, 2; shr.u64 %rd1120, %rd402, 2; $L__BB0_15: and.b64 %rd406, %rd1121, 3; or.b64 %rd407, %rd406, %rd28; shl.b64 %rd408, %rd407, 3; or.b64 %rd409, %rd36, %rd408; shl.b64 %rd410, %rd409, 6; mov.u64 %rd411, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; mov.u64 %rd412, 0; add.s64 %rd413, %rd411, %rd410; st.shared.u64 [%rd413+32], %rd412; mov.f32 %f850, 0f00000000; st.shared.v2.f32 [%rd413+40], {%f850, %f850}; st.shared.v2.f32 [%rd413+24], {%f850, %f850}; st.shared.v2.f32 [%rd413+16], {%f850, %f850}; st.shared.u32 [%rd413+56], %rd412; st.shared.u64 [%rd413+48], %rd412; st.shared.u64 [%rd413], %rd412; mov.u32 %r314, -1; st.shared.u32 [%rd413+60], %r314; and.b64 %rd414, %rd1120, 3; or.b64 %rd415, %rd414, %rd28; shl.b64 %rd416, %rd415, 3; or.b64 %rd417, %rd37, %rd416; shl.b64 %rd418, %rd417, 6; add.s64 %rd419, %rd411, %rd418; st.shared.u64 [%rd419+32], %rd412; st.shared.v2.f32 [%rd419+40], {%f850, %f850}; st.shared.v2.f32 [%rd419+24], {%f850, %f850}; st.shared.v2.f32 [%rd419+16], {%f850, %f850}; st.shared.u32 [%rd419+56], %rd412; st.shared.u64 [%rd419+48], %rd412; st.shared.u64 [%rd419], %rd412; st.shared.u32 [%rd419+60], %r314; and.b64 %rd420, %rd1122, 3; or.b64 %rd421, %rd420, %rd28; shl.b64 %rd422, %rd421, 3; or.b64 %rd423, %rd36, %rd422; shl.b64 %rd424, %rd423, 6; xor.b64 %rd425, %rd424, 128; add.s64 %rd426, %rd411, %rd425; st.shared.u64 [%rd426+32], %rd412; st.shared.v2.f32 [%rd426+40], {%f850, %f850}; st.shared.v2.f32 [%rd426+24], {%f850, %f850}; st.shared.v2.f32 [%rd426+16], {%f850, %f850}; st.shared.u32 [%rd426+56], %rd412; st.shared.u64 [%rd426+48], %rd412; st.shared.u64 [%rd426], %rd412; st.shared.u32 [%rd426+60], %r314; 
and.b64 %rd427, %rd1123, 3; or.b64 %rd428, %rd427, %rd28; shl.b64 %rd429, %rd428, 3; or.b64 %rd430, %rd38, %rd429; shl.b64 %rd431, %rd430, 6; add.s64 %rd432, %rd411, %rd431; st.shared.u64 [%rd432+32], %rd412; st.shared.v2.f32 [%rd432+40], {%f850, %f850}; st.shared.v2.f32 [%rd432+24], {%f850, %f850}; st.shared.v2.f32 [%rd432+16], {%f850, %f850}; st.shared.u64 [%rd432+48], %rd412; st.shared.u32 [%rd432+56], %rd412; st.shared.u64 [%rd432], %rd412; st.shared.u32 [%rd432+60], %r314; add.s64 %rd1123, %rd1123, 1; add.s64 %rd1122, %rd1122, 1; add.s64 %rd1121, %rd1121, 1; add.s64 %rd1120, %rd1120, 1; add.s64 %rd1124, %rd1124, 4; setp.lt.u64 %p16, %rd1124, %rd26; @%p16 bra $L__BB0_15; bra.uni $L__BB0_31; $L__BB0_8: setp.lt.u32 %p11, %r2, 65; @%p11 bra $L__BB0_17; bra.uni $L__BB0_31; $L__BB0_17: and.b64 %rd1127, %rd14, 15; add.s64 %rd56, %rd1127, %rd13; shl.b64 %rd433, %rd1116, 4; add.s64 %rd434, %rd9, %rd433; shl.b64 %rd57, %rd15, 2; shl.b64 %rd58, %rd16, 2; ld.global.u32 %r315, [%rd434+8]; mul.wide.u32 %rd59, %r315, 16; add.s64 %rd435, %rd1127, 1; max.u64 %rd436, %rd435, %rd56; sub.s64 %rd437, %rd436, %rd14; and.b64 %rd438, %rd437, 1; setp.eq.b64 %p18, %rd438, 1; mov.pred %p19, 0; xor.pred %p20, %p18, %p19; not.pred %p21, %p20; @%p21 bra $L__BB0_22; and.b64 %rd439, %rd14, 3; shr.u64 %rd440, %rd14, 2; and.b64 %rd441, %rd440, 3; or.b64 %rd442, %rd439, %rd57; or.b64 %rd443, %rd441, %rd58; shl.b64 %rd444, %rd443, 3; or.b64 %rd445, %rd442, %rd444; or.b64 %rd446, %rd439, %rd59; and.b64 %rd447, %rd14, 12; or.b64 %rd60, %rd446, %rd447; setp.gt.u64 %p22, %rd343, %rd60; shl.b64 %rd448, %rd445, 6; mov.u64 %rd449, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; add.s64 %rd61, %rd449, %rd448; @%p22 bra $L__BB0_20; bra.uni $L__BB0_19; $L__BB0_20: mul.lo.s64 %rd451, %rd60, 56; add.s64 %rd452, %rd8, %rd451; ld.global.u32 %rd453, [%rd452+4]; ld.global.u32 %rd454, [%rd452+8]; bfi.b64 %rd455, %rd454, %rd453, 32, 32; st.shared.u64 [%rd61+32], 
%rd455; ld.global.u32 %r317, [%rd452+12]; st.shared.u32 [%rd61+48], %r317; ld.global.u64 %rd456, [%rd452+48]; st.shared.u32 [%rd61+52], %rd456; shr.u64 %rd457, %rd456, 32; st.shared.u32 [%rd61+56], %rd457; ld.global.u64 %rd458, [%rd452+24]; ld.global.u64 %rd459, [%rd452+32]; st.shared.u64 [%rd61], %rd458; st.shared.u64 [%rd61+8], %rd459; ld.global.u32 %r318, [%rd452+20]; st.shared.u32 [%rd61+16], %r318; bra.uni $L__BB0_21; $L__BB0_19: mov.u64 %rd450, 0; st.shared.u64 [%rd61+32], %rd450; mov.u32 %r316, 0; st.shared.u32 [%rd61+56], %rd450; st.shared.u64 [%rd61+48], %rd450; st.shared.u64 [%rd61], %rd450; st.shared.u32 [%rd61+16], %r316; $L__BB0_21: mov.u32 %r319, 0; mov.u64 %rd460, 0; mov.f32 %f851, 0f00000000; st.shared.v2.f32 [%rd61+40], {%f851, %f851}; st.shared.u64 [%rd61+24], %rd460; st.shared.u32 [%rd61+20], %r319; mov.u32 %r320, -1; st.shared.u32 [%rd61+60], %r320; mov.u64 %rd1127, %rd435; $L__BB0_22: setp.ge.u64 %p23, %rd435, %rd56; @%p23 bra $L__BB0_31; mov.u64 %rd472, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; $L__BB0_24: shr.u64 %rd462, %rd1127, 2; and.b64 %rd463, %rd462, 3; and.b64 %rd464, %rd1127, 3; or.b64 %rd465, %rd464, %rd57; or.b64 %rd466, %rd463, %rd58; shl.b64 %rd467, %rd466, 3; or.b64 %rd468, %rd465, %rd467; or.b64 %rd469, %rd464, %rd59; and.b64 %rd470, %rd1127, 12; or.b64 %rd65, %rd469, %rd470; setp.gt.u64 %p24, %rd343, %rd65; shl.b64 %rd471, %rd468, 6; add.s64 %rd66, %rd472, %rd471; @%p24 bra $L__BB0_26; bra.uni $L__BB0_25; $L__BB0_26: mul.lo.s64 %rd474, %rd65, 56; add.s64 %rd475, %rd8, %rd474; ld.global.u32 %rd476, [%rd475+4]; ld.global.u32 %rd477, [%rd475+8]; bfi.b64 %rd478, %rd477, %rd476, 32, 32; st.shared.u64 [%rd66+32], %rd478; ld.global.u32 %r322, [%rd475+12]; st.shared.u32 [%rd66+48], %r322; ld.global.u64 %rd479, [%rd475+48]; st.shared.u32 [%rd66+52], %rd479; shr.u64 %rd480, %rd479, 32; st.shared.u32 [%rd66+56], %rd480; ld.global.u64 %rd481, [%rd475+24]; ld.global.u64 %rd482, 
[%rd475+32]; st.shared.u64 [%rd66], %rd481; st.shared.u64 [%rd66+8], %rd482; ld.global.u32 %r323, [%rd475+20]; st.shared.u32 [%rd66+16], %r323; bra.uni $L__BB0_27; $L__BB0_25: mov.u64 %rd473, 0; st.shared.u64 [%rd66+32], %rd473; mov.u32 %r321, 0; st.shared.u32 [%rd66+56], %rd473; st.shared.u64 [%rd66+48], %rd473; st.shared.u64 [%rd66], %rd473; st.shared.u32 [%rd66+16], %r321; $L__BB0_27: mov.u32 %r324, 0; mov.u64 %rd483, 0; mov.f32 %f852, 0f00000000; st.shared.v2.f32 [%rd66+40], {%f852, %f852}; st.shared.u64 [%rd66+24], %rd483; st.shared.u32 [%rd66+20], %r324; mov.u32 %r325, -1; st.shared.u32 [%rd66+60], %r325; add.s64 %rd67, %rd1127, 2; add.s64 %rd484, %rd1127, 1; and.b64 %rd485, %rd484, 3; shr.u64 %rd486, %rd484, 2; and.b64 %rd487, %rd486, 3; or.b64 %rd488, %rd485, %rd57; or.b64 %rd489, %rd487, %rd58; shl.b64 %rd490, %rd489, 3; or.b64 %rd491, %rd488, %rd490; or.b64 %rd492, %rd485, %rd59; and.b64 %rd493, %rd484, 12; or.b64 %rd68, %rd492, %rd493; setp.gt.u64 %p25, %rd343, %rd68; shl.b64 %rd494, %rd491, 6; add.s64 %rd69, %rd472, %rd494; @%p25 bra $L__BB0_29; bra.uni $L__BB0_28; $L__BB0_29: mul.lo.s64 %rd497, %rd68, 56; add.s64 %rd498, %rd8, %rd497; ld.global.u32 %rd499, [%rd498+4]; ld.global.u32 %rd500, [%rd498+8]; bfi.b64 %rd501, %rd500, %rd499, 32, 32; st.shared.u64 [%rd69+32], %rd501; ld.global.u32 %r327, [%rd498+12]; st.shared.u32 [%rd69+48], %r327; ld.global.u64 %rd502, [%rd498+48]; st.shared.u32 [%rd69+52], %rd502; shr.u64 %rd503, %rd502, 32; st.shared.u32 [%rd69+56], %rd503; ld.global.u64 %rd504, [%rd498+24]; ld.global.u64 %rd505, [%rd498+32]; st.shared.u64 [%rd69], %rd504; st.shared.u64 [%rd69+8], %rd505; ld.global.u32 %r328, [%rd498+20]; st.shared.u32 [%rd69+16], %r328; bra.uni $L__BB0_30; $L__BB0_28: st.shared.u64 [%rd69+32], %rd483; st.shared.u32 [%rd69+56], %rd483; st.shared.u64 [%rd69+48], %rd483; st.shared.u64 [%rd69], %rd483; st.shared.u32 [%rd69+16], %r324; $L__BB0_30: mov.u32 %r329, 0; mov.u64 %rd506, 0; mov.f32 %f853, 0f00000000; st.shared.v2.f32 
[%rd69+40], {%f853, %f853}; st.shared.u64 [%rd69+24], %rd506; st.shared.u32 [%rd69+20], %r329; mov.u32 %r330, -1; st.shared.u32 [%rd69+60], %r330; setp.lt.u64 %p26, %rd67, %rd56; mov.u64 %rd1127, %rd67; @%p26 bra $L__BB0_24; $L__BB0_31: bar.sync 0; add.s32 %r331, %r312, %r311; add.s32 %r6, %r3, %r1; setp.ge.u32 %p27, %r6, %r331; @%p27 bra $L__BB0_511; cvta.to.global.u64 %rd507, %rd335; mul.wide.u32 %rd508, %r6, 4; add.s64 %rd509, %rd507, %rd508; ld.global.u32 %r332, [%rd509]; cvt.u64.u32 %rd70, %r332; cvta.to.global.u64 %rd510, %rd330; mul.wide.u32 %rd511, %r332, 24; add.s64 %rd71, %rd510, %rd511; ld.global.v4.u8 {%rs9, %rs10, %rs11, %rs12}, [%rd71]; ld.global.u32 %rd512, [%rd71+4]; ld.global.u32 %rd513, [%rd71+8]; bfi.b64 %rd72, %rd513, %rd512, 32, 32; ld.global.u32 %r7, [%rd71+12]; ld.global.u64 %rd73, [%rd71+16]; cvta.to.global.u64 %rd514, %rd331; mul.wide.u32 %rd515, %r332, 8; add.s64 %rd74, %rd514, %rd515; ld.global.f32 %f2, [%rd74]; ld.global.f32 %f3, [%rd74+4]; cvta.to.global.u64 %rd516, %rd332; add.s64 %rd75, %rd516, %rd515; ld.global.u32 %rd517, [%rd75]; ld.global.u32 %rd518, [%rd75+4]; bfi.b64 %rd519, %rd518, %rd517, 32, 32; st.local.u64 [%rd6], %rd519; cvta.to.global.u64 %rd520, %rd333; mul.wide.u32 %rd521, %r332, 32; add.s64 %rd76, %rd520, %rd521; ld.global.f32 %f4, [%rd76]; ld.global.f32 %f5, [%rd76+4]; ld.global.f32 %f6, [%rd76+8]; ld.global.f32 %f7, [%rd76+12]; ld.global.f32 %f4864, [%rd76+16]; ld.global.f32 %f4863, [%rd76+20]; ld.global.f32 %f10, [%rd76+24]; add.u64 %rd77, %SPL, 16; st.local.v4.f32 [%rd77], {%f7, %f4864, %f4863, %f10}; ld.global.f32 %f4865, [%rd76+28]; cvta.to.global.u64 %rd523, %rd334; add.s64 %rd78, %rd523, %rd515; ld.global.u32 %r8, [%rd78]; ld.global.u32 %r1706, [%rd78+4]; mul.f32 %f857, %f848, %f848; mov.f32 %f858, 0f40800000; div.rn.f32 %f12, %f858, %f857; div.rn.f32 %f859, %f2, %f848; div.rn.f32 %f860, %f3, %f848; mov.b32 %r333, %f859; and.b32 %r334, %r333, -2147483648; or.b32 %r335, %r334, 1056964608; mov.b32 %f861, %r335; 
add.rz.f32 %f862, %f859, %f861; cvt.rzi.f32.f32 %f13, %f862; mov.b32 %r336, %f860; and.b32 %r337, %r336, -2147483648; or.b32 %r338, %r337, 1056964608; mov.b32 %f863, %r338; add.rz.f32 %f864, %f860, %f863; cvt.rzi.f32.f32 %f14, %f864; add.f32 %f865, %f13, 0fBF800000; mul.f32 %f867, %f848, %f865; sub.f32 %f15, %f867, %f2; add.u64 %rd79, %SPL, 0; mov.u64 %rd525, 0; st.local.v2.u64 [%rd79], {%rd525, %rd525}; neg.f32 %f869, %f15; div.rn.f32 %f17, %f869, %f848; mov.f32 %f870, 0f3FC00000; sub.f32 %f18, %f870, %f17; mov.f32 %f4813, 0f3F800000; cvt.rzi.f32.f32 %f871, %f4813; add.f32 %f872, %f871, %f871; mov.f32 %f873, 0f40000000; sub.f32 %f874, %f873, %f872; abs.f32 %f19, %f874; abs.f32 %f20, %f18; setp.lt.f32 %p28, %f20, 0f00800000; mul.f32 %f875, %f20, 0f4B800000; selp.f32 %f876, %f875, %f20, %p28; selp.f32 %f877, 0fC1C00000, 0f00000000, %p28; mov.b32 %r339, %f876; add.s32 %r340, %r339, -1060439283; and.b32 %r341, %r340, -8388608; sub.s32 %r342, %r339, %r341; mov.b32 %f878, %r342; cvt.rn.f32.s32 %f879, %r341; mov.f32 %f880, 0f34000000; fma.rn.f32 %f881, %f879, %f880, %f877; add.f32 %f882, %f878, 0fBF800000; add.f32 %f855, %f878, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f854,%f855; // end inline asm add.f32 %f883, %f882, %f882; mul.f32 %f884, %f854, %f883; mul.f32 %f885, %f884, %f884; sub.f32 %f886, %f882, %f884; add.f32 %f887, %f886, %f886; neg.f32 %f888, %f884; fma.rn.f32 %f889, %f888, %f882, %f887; mul.rn.f32 %f890, %f854, %f889; mov.f32 %f891, 0f3B52E7DB; mov.f32 %f892, 0f3A2C32E4; fma.rn.f32 %f893, %f892, %f885, %f891; mov.f32 %f894, 0f3C93BB73; fma.rn.f32 %f895, %f893, %f885, %f894; mov.f32 %f896, 0f3DF6384F; fma.rn.f32 %f897, %f895, %f885, %f896; mul.rn.f32 %f898, %f897, %f885; mov.f32 %f899, 0f3FB8AA3B; fma.rn.f32 %f900, %f884, %f899, %f881; sub.f32 %f901, %f881, %f900; fma.rn.f32 %f902, %f884, %f899, %f901; fma.rn.f32 %f903, %f890, %f899, %f902; mov.f32 %f904, 0f32A55E34; fma.rn.f32 %f905, %f884, %f904, %f903; mul.f32 %f906, %f898, 0f40400000; 
fma.rn.f32 %f907, %f906, %f890, %f905; fma.rn.f32 %f908, %f898, %f884, %f907; add.rn.f32 %f909, %f900, %f908; neg.f32 %f910, %f900; add.rn.f32 %f911, %f909, %f910; neg.f32 %f912, %f911; add.rn.f32 %f913, %f908, %f912; mul.rn.f32 %f914, %f909, %f873; neg.f32 %f915, %f914; fma.rn.f32 %f916, %f909, %f873, %f915; fma.rn.f32 %f917, %f913, %f873, %f916; cvt.rni.f32.f32 %f918, %f914; sub.f32 %f919, %f914, %f918; add.f32 %f920, %f917, %f919; mov.f32 %f921, 0f3AAF85ED; mov.f32 %f922, 0f391FCB8E; fma.rn.f32 %f923, %f922, %f920, %f921; mov.f32 %f924, 0f3C1D9856; fma.rn.f32 %f925, %f923, %f920, %f924; mov.f32 %f926, 0f3D6357BB; fma.rn.f32 %f927, %f925, %f920, %f926; mov.f32 %f928, 0f3E75FDEC; fma.rn.f32 %f929, %f927, %f920, %f928; mov.f32 %f930, 0f3F317218; fma.rn.f32 %f931, %f929, %f920, %f930; fma.rn.f32 %f932, %f931, %f920, %f4813; cvt.rzi.s32.f32 %r343, %f918; setp.gt.f32 %p29, %f918, 0f00000000; selp.b32 %r344, 0, -2097152000, %p29; add.s32 %r345, %r344, 2130706432; mov.b32 %f933, %r345; mul.f32 %f934, %f932, %f933; shl.b32 %r346, %r343, 23; sub.s32 %r347, %r346, %r344; mov.b32 %f935, %r347; mul.f32 %f936, %f934, %f935; abs.f32 %f937, %f914; setp.gt.f32 %p30, %f937, 0f43180000; setp.lt.f32 %p31, %f914, 0f00000000; selp.f32 %f938, 0f00000000, 0f7F800000, %p31; selp.f32 %f21, %f938, %f936, %p30; setp.eq.f32 %p32, %f18, 0f3F800000; mov.f32 %f4812, %f4813; @%p32 bra $L__BB0_39; mov.f32 %f4769, 0f3FC00000; sub.f32 %f4768, %f4769, %f17; abs.f32 %f4767, %f4768; setp.gtu.f32 %p33, %f4767, 0f7F800000; @%p33 bra $L__BB0_38; bra.uni $L__BB0_34; $L__BB0_38: mov.f32 %f4796, 0f3FC00000; sub.f32 %f4795, %f4796, %f17; mov.f32 %f941, 0f40000000; add.rn.f32 %f4812, %f4795, %f941; bra.uni $L__BB0_39; $L__BB0_34: mov.f32 %f4772, 0f3FC00000; sub.f32 %f4771, %f4772, %f17; abs.f32 %f4770, %f4771; setp.eq.f32 %p34, %f4771, 0f00000000; setp.eq.f32 %p35, %f4770, 0f7F800000; or.pred %p36, %p34, %p35; @%p36 bra $L__BB0_37; bra.uni $L__BB0_35; $L__BB0_37: mov.f32 %f4794, 0f3FC00000; sub.f32 %f4793, 
%f4794, %f17; setp.eq.f32 %p39, %f19, 0f3F800000; add.f32 %f940, %f4793, %f4793; mov.b32 %r348, %f940; and.b32 %r349, %r348, 2147483647; selp.b32 %r350, %r348, %r349, %p39; mov.b32 %f4812, %r350; bra.uni $L__BB0_39; $L__BB0_35: mov.f32 %f4792, 0f3FC00000; sub.f32 %f4791, %f4792, %f17; setp.geu.f32 %p37, %f4791, 0f00000000; mov.f32 %f4812, %f21; @%p37 bra $L__BB0_39; setp.eq.f32 %p38, %f19, 0f3F800000; neg.f32 %f939, %f21; selp.f32 %f4812, %f939, %f21, %p38; $L__BB0_39: add.f32 %f26, %f17, 0fBF800000; abs.f32 %f27, %f26; setp.lt.f32 %p40, %f27, 0f00800000; mul.f32 %f945, %f27, 0f4B800000; selp.f32 %f946, %f945, %f27, %p40; selp.f32 %f947, 0fC1C00000, 0f00000000, %p40; mov.b32 %r351, %f946; add.s32 %r352, %r351, -1060439283; and.b32 %r353, %r352, -8388608; sub.s32 %r354, %r351, %r353; mov.b32 %f948, %r354; cvt.rn.f32.s32 %f949, %r353; fma.rn.f32 %f951, %f949, %f880, %f947; add.f32 %f952, %f948, 0fBF800000; add.f32 %f943, %f948, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f942,%f943; // end inline asm add.f32 %f953, %f952, %f952; mul.f32 %f955, %f942, %f953; mul.f32 %f956, %f955, %f955; sub.f32 %f957, %f952, %f955; add.f32 %f958, %f957, %f957; neg.f32 %f959, %f955; fma.rn.f32 %f960, %f959, %f952, %f958; mul.rn.f32 %f961, %f942, %f960; fma.rn.f32 %f964, %f892, %f956, %f891; fma.rn.f32 %f966, %f964, %f956, %f894; fma.rn.f32 %f968, %f966, %f956, %f896; mul.rn.f32 %f969, %f968, %f956; fma.rn.f32 %f971, %f955, %f899, %f951; sub.f32 %f972, %f951, %f971; fma.rn.f32 %f973, %f955, %f899, %f972; fma.rn.f32 %f974, %f961, %f899, %f973; fma.rn.f32 %f976, %f955, %f904, %f974; mul.f32 %f977, %f969, 0f40400000; fma.rn.f32 %f978, %f977, %f961, %f976; fma.rn.f32 %f979, %f969, %f955, %f978; add.rn.f32 %f980, %f971, %f979; neg.f32 %f981, %f971; add.rn.f32 %f982, %f980, %f981; neg.f32 %f983, %f982; add.rn.f32 %f984, %f979, %f983; mul.rn.f32 %f985, %f980, %f873; neg.f32 %f986, %f985; fma.rn.f32 %f987, %f980, %f873, %f986; fma.rn.f32 %f988, %f984, %f873, %f987; cvt.rni.f32.f32 
%f989, %f985; sub.f32 %f990, %f985, %f989; add.f32 %f991, %f988, %f990; fma.rn.f32 %f994, %f922, %f991, %f921; fma.rn.f32 %f996, %f994, %f991, %f924; fma.rn.f32 %f998, %f996, %f991, %f926; fma.rn.f32 %f1000, %f998, %f991, %f928; fma.rn.f32 %f1002, %f1000, %f991, %f930; fma.rn.f32 %f1003, %f1002, %f991, %f4813; cvt.rzi.s32.f32 %r355, %f989; setp.gt.f32 %p41, %f989, 0f00000000; selp.b32 %r356, 0, -2097152000, %p41; add.s32 %r357, %r356, 2130706432; mov.b32 %f1004, %r357; mul.f32 %f1005, %f1003, %f1004; shl.b32 %r358, %r355, 23; sub.s32 %r359, %r358, %r356; mov.b32 %f1006, %r359; mul.f32 %f1007, %f1005, %f1006; abs.f32 %f1008, %f985; setp.gt.f32 %p42, %f1008, 0f43180000; setp.lt.f32 %p43, %f985, 0f00000000; selp.f32 %f1009, 0f00000000, 0f7F800000, %p43; selp.f32 %f28, %f1009, %f1007, %p42; setp.eq.f32 %p44, %f26, 0f3F800000; @%p44 bra $L__BB0_46; setp.gtu.f32 %p45, %f27, 0f7F800000; @%p45 bra $L__BB0_45; bra.uni $L__BB0_41; $L__BB0_45: mov.f32 %f1012, 0f40000000; add.rn.f32 %f4813, %f26, %f1012; bra.uni $L__BB0_46; $L__BB0_41: setp.eq.f32 %p46, %f26, 0f00000000; setp.eq.f32 %p47, %f27, 0f7F800000; or.pred %p48, %p46, %p47; @%p48 bra $L__BB0_44; bra.uni $L__BB0_42; $L__BB0_44: setp.eq.f32 %p51, %f19, 0f3F800000; add.f32 %f1011, %f26, %f26; mov.b32 %r360, %f1011; and.b32 %r361, %r360, 2147483647; selp.b32 %r362, %r360, %r361, %p51; mov.b32 %f4813, %r362; bra.uni $L__BB0_46; $L__BB0_42: setp.geu.f32 %p49, %f26, 0f00000000; mov.f32 %f4813, %f28; @%p49 bra $L__BB0_46; setp.eq.f32 %p50, %f19, 0f3F800000; neg.f32 %f1010, %f28; selp.f32 %f4813, %f1010, %f28, %p50; $L__BB0_46: add.f32 %f33, %f17, 0fBF000000; abs.f32 %f34, %f33; setp.lt.f32 %p52, %f34, 0f00800000; mul.f32 %f1016, %f34, 0f4B800000; selp.f32 %f1017, %f1016, %f34, %p52; selp.f32 %f1018, 0fC1C00000, 0f00000000, %p52; mov.b32 %r363, %f1017; add.s32 %r364, %r363, -1060439283; and.b32 %r365, %r364, -8388608; sub.s32 %r366, %r363, %r365; mov.b32 %f1019, %r366; cvt.rn.f32.s32 %f1020, %r365; mov.f32 %f1021, 0f34000000; 
fma.rn.f32 %f1022, %f1020, %f1021, %f1018; add.f32 %f1023, %f1019, 0fBF800000; add.f32 %f1014, %f1019, 0f3F800000; mov.f32 %f4815, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f1013,%f1014; // end inline asm add.f32 %f1024, %f1023, %f1023; mov.f32 %f1025, 0f40000000; mul.f32 %f1026, %f1013, %f1024; mul.f32 %f1027, %f1026, %f1026; sub.f32 %f1028, %f1023, %f1026; add.f32 %f1029, %f1028, %f1028; neg.f32 %f1030, %f1026; fma.rn.f32 %f1031, %f1030, %f1023, %f1029; mul.rn.f32 %f1032, %f1013, %f1031; mov.f32 %f1033, 0f3B52E7DB; mov.f32 %f1034, 0f3A2C32E4; fma.rn.f32 %f1035, %f1034, %f1027, %f1033; mov.f32 %f1036, 0f3C93BB73; fma.rn.f32 %f1037, %f1035, %f1027, %f1036; mov.f32 %f1038, 0f3DF6384F; fma.rn.f32 %f1039, %f1037, %f1027, %f1038; mul.rn.f32 %f1040, %f1039, %f1027; mov.f32 %f1041, 0f3FB8AA3B; fma.rn.f32 %f1042, %f1026, %f1041, %f1022; sub.f32 %f1043, %f1022, %f1042; fma.rn.f32 %f1044, %f1026, %f1041, %f1043; fma.rn.f32 %f1045, %f1032, %f1041, %f1044; mov.f32 %f1046, 0f32A55E34; fma.rn.f32 %f1047, %f1026, %f1046, %f1045; mul.f32 %f1048, %f1040, 0f40400000; fma.rn.f32 %f1049, %f1048, %f1032, %f1047; fma.rn.f32 %f1050, %f1040, %f1026, %f1049; add.rn.f32 %f1051, %f1042, %f1050; neg.f32 %f1052, %f1042; add.rn.f32 %f1053, %f1051, %f1052; neg.f32 %f1054, %f1053; add.rn.f32 %f1055, %f1050, %f1054; mul.rn.f32 %f1056, %f1051, %f1025; neg.f32 %f1057, %f1056; fma.rn.f32 %f1058, %f1051, %f1025, %f1057; fma.rn.f32 %f1059, %f1055, %f1025, %f1058; cvt.rni.f32.f32 %f1060, %f1056; sub.f32 %f1061, %f1056, %f1060; add.f32 %f1062, %f1059, %f1061; mov.f32 %f1063, 0f3AAF85ED; mov.f32 %f1064, 0f391FCB8E; fma.rn.f32 %f1065, %f1064, %f1062, %f1063; mov.f32 %f1066, 0f3C1D9856; fma.rn.f32 %f1067, %f1065, %f1062, %f1066; mov.f32 %f1068, 0f3D6357BB; fma.rn.f32 %f1069, %f1067, %f1062, %f1068; mov.f32 %f1070, 0f3E75FDEC; fma.rn.f32 %f1071, %f1069, %f1062, %f1070; mov.f32 %f1072, 0f3F317218; fma.rn.f32 %f1073, %f1071, %f1062, %f1072; fma.rn.f32 %f1074, %f1073, %f1062, %f4815; cvt.rzi.s32.f32 
%r367, %f1060; setp.gt.f32 %p53, %f1060, 0f00000000; selp.b32 %r368, 0, -2097152000, %p53; add.s32 %r369, %r368, 2130706432; mov.b32 %f1075, %r369; mul.f32 %f1076, %f1074, %f1075; shl.b32 %r370, %r367, 23; sub.s32 %r371, %r370, %r368; mov.b32 %f1077, %r371; mul.f32 %f1078, %f1076, %f1077; abs.f32 %f1079, %f1056; setp.gt.f32 %p54, %f1079, 0f43180000; setp.lt.f32 %p55, %f1056, 0f00000000; selp.f32 %f1080, 0f00000000, 0f7F800000, %p55; selp.f32 %f35, %f1080, %f1078, %p54; setp.eq.f32 %p56, %f33, 0f3F800000; mov.f32 %f4814, %f4815; @%p56 bra $L__BB0_53; add.f32 %f4798, %f17, 0fBF000000; abs.f32 %f4797, %f4798; setp.gtu.f32 %p57, %f4797, 0f7F800000; @%p57 bra $L__BB0_52; bra.uni $L__BB0_48; $L__BB0_52: add.f32 %f4804, %f17, 0fBF000000; mov.f32 %f1083, 0f40000000; add.rn.f32 %f4814, %f4804, %f1083; bra.uni $L__BB0_53; $L__BB0_48: add.f32 %f4801, %f17, 0fBF000000; add.f32 %f4800, %f17, 0fBF000000; abs.f32 %f4799, %f4800; setp.eq.f32 %p58, %f4800, 0f00000000; setp.eq.f32 %p59, %f4799, 0f7F800000; or.pred %p60, %p58, %p59; @%p60 bra $L__BB0_51; bra.uni $L__BB0_49; $L__BB0_51: add.f32 %f4803, %f17, 0fBF000000; setp.eq.f32 %p63, %f19, 0f3F800000; add.f32 %f1082, %f4803, %f4803; mov.b32 %r372, %f1082; and.b32 %r373, %r372, 2147483647; selp.b32 %r374, %r372, %r373, %p63; mov.b32 %f4814, %r374; bra.uni $L__BB0_53; $L__BB0_49: add.f32 %f4802, %f17, 0fBF000000; setp.geu.f32 %p61, %f4802, 0f00000000; mov.f32 %f4814, %f35; @%p61 bra $L__BB0_53; setp.eq.f32 %p62, %f19, 0f3F800000; neg.f32 %f1081, %f35; selp.f32 %f4814, %f1081, %f35, %p62; $L__BB0_53: add.f32 %f4775, %f14, 0fBF800000; mul.f32 %f4774, %f848, %f4775; sub.f32 %f4773, %f4774, %f3; neg.f32 %f1087, %f4773; div.rn.f32 %f40, %f1087, %f848; mov.f32 %f1088, 0f3FC00000; sub.f32 %f41, %f1088, %f40; abs.f32 %f42, %f41; setp.lt.f32 %p64, %f42, 0f00800000; mul.f32 %f1089, %f42, 0f4B800000; selp.f32 %f1090, %f1089, %f42, %p64; selp.f32 %f1091, 0fC1C00000, 0f00000000, %p64; mov.b32 %r375, %f1090; add.s32 %r376, %r375, -1060439283; 
and.b32 %r377, %r376, -8388608; sub.s32 %r378, %r375, %r377; mov.b32 %f1092, %r378; cvt.rn.f32.s32 %f1093, %r377; fma.rn.f32 %f1095, %f1093, %f1021, %f1091; add.f32 %f1096, %f1092, 0fBF800000; add.f32 %f1085, %f1092, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f1084,%f1085; // end inline asm add.f32 %f1097, %f1096, %f1096; mul.f32 %f1099, %f1084, %f1097; mul.f32 %f1100, %f1099, %f1099; sub.f32 %f1101, %f1096, %f1099; add.f32 %f1102, %f1101, %f1101; neg.f32 %f1103, %f1099; fma.rn.f32 %f1104, %f1103, %f1096, %f1102; mul.rn.f32 %f1105, %f1084, %f1104; fma.rn.f32 %f1108, %f1034, %f1100, %f1033; fma.rn.f32 %f1110, %f1108, %f1100, %f1036; fma.rn.f32 %f1112, %f1110, %f1100, %f1038; mul.rn.f32 %f1113, %f1112, %f1100; fma.rn.f32 %f1115, %f1099, %f1041, %f1095; sub.f32 %f1116, %f1095, %f1115; fma.rn.f32 %f1117, %f1099, %f1041, %f1116; fma.rn.f32 %f1118, %f1105, %f1041, %f1117; fma.rn.f32 %f1120, %f1099, %f1046, %f1118; mul.f32 %f1121, %f1113, 0f40400000; fma.rn.f32 %f1122, %f1121, %f1105, %f1120; fma.rn.f32 %f1123, %f1113, %f1099, %f1122; add.rn.f32 %f1124, %f1115, %f1123; neg.f32 %f1125, %f1115; add.rn.f32 %f1126, %f1124, %f1125; neg.f32 %f1127, %f1126; add.rn.f32 %f1128, %f1123, %f1127; mul.rn.f32 %f1129, %f1124, %f1025; neg.f32 %f1130, %f1129; fma.rn.f32 %f1131, %f1124, %f1025, %f1130; fma.rn.f32 %f1132, %f1128, %f1025, %f1131; cvt.rni.f32.f32 %f1133, %f1129; sub.f32 %f1134, %f1129, %f1133; add.f32 %f1135, %f1132, %f1134; fma.rn.f32 %f1138, %f1064, %f1135, %f1063; fma.rn.f32 %f1140, %f1138, %f1135, %f1066; fma.rn.f32 %f1142, %f1140, %f1135, %f1068; fma.rn.f32 %f1144, %f1142, %f1135, %f1070; fma.rn.f32 %f1146, %f1144, %f1135, %f1072; fma.rn.f32 %f1147, %f1146, %f1135, %f4815; cvt.rzi.s32.f32 %r379, %f1133; setp.gt.f32 %p65, %f1133, 0f00000000; selp.b32 %r380, 0, -2097152000, %p65; add.s32 %r381, %r380, 2130706432; mov.b32 %f1148, %r381; mul.f32 %f1149, %f1147, %f1148; shl.b32 %r382, %r379, 23; sub.s32 %r383, %r382, %r380; mov.b32 %f1150, %r383; mul.f32 %f1151, 
%f1149, %f1150; abs.f32 %f1152, %f1129; setp.gt.f32 %p66, %f1152, 0f43180000; setp.lt.f32 %p67, %f1129, 0f00000000; selp.f32 %f1153, 0f00000000, 0f7F800000, %p67; selp.f32 %f43, %f1153, %f1151, %p66; setp.eq.f32 %p68, %f41, 0f3F800000; @%p68 bra $L__BB0_60; setp.gtu.f32 %p69, %f42, 0f7F800000; @%p69 bra $L__BB0_59; bra.uni $L__BB0_55; $L__BB0_59: mov.f32 %f1156, 0f40000000; add.rn.f32 %f4815, %f41, %f1156; bra.uni $L__BB0_60; $L__BB0_55: setp.eq.f32 %p70, %f41, 0f00000000; setp.eq.f32 %p71, %f42, 0f7F800000; or.pred %p72, %p70, %p71; @%p72 bra $L__BB0_58; bra.uni $L__BB0_56; $L__BB0_58: setp.eq.f32 %p75, %f19, 0f3F800000; add.f32 %f1155, %f41, %f41; mov.b32 %r384, %f1155; and.b32 %r385, %r384, 2147483647; selp.b32 %r386, %r384, %r385, %p75; mov.b32 %f4815, %r386; bra.uni $L__BB0_60; $L__BB0_56: setp.geu.f32 %p73, %f41, 0f00000000; mov.f32 %f4815, %f43; @%p73 bra $L__BB0_60; setp.eq.f32 %p74, %f19, 0f3F800000; neg.f32 %f1154, %f43; selp.f32 %f4815, %f1154, %f43, %p74; $L__BB0_60: add.f32 %f48, %f40, 0fBF800000; abs.f32 %f49, %f48; setp.lt.f32 %p76, %f49, 0f00800000; mul.f32 %f1160, %f49, 0f4B800000; selp.f32 %f1161, %f1160, %f49, %p76; selp.f32 %f1162, 0fC1C00000, 0f00000000, %p76; mov.b32 %r387, %f1161; add.s32 %r388, %r387, -1060439283; and.b32 %r389, %r388, -8388608; sub.s32 %r390, %r387, %r389; mov.b32 %f1163, %r390; cvt.rn.f32.s32 %f1164, %r389; mov.f32 %f1165, 0f34000000; fma.rn.f32 %f1166, %f1164, %f1165, %f1162; add.f32 %f1167, %f1163, 0fBF800000; add.f32 %f1158, %f1163, 0f3F800000; mov.f32 %f4817, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f1157,%f1158; // end inline asm add.f32 %f1168, %f1167, %f1167; mov.f32 %f1169, 0f40000000; mul.f32 %f1170, %f1157, %f1168; mul.f32 %f1171, %f1170, %f1170; sub.f32 %f1172, %f1167, %f1170; add.f32 %f1173, %f1172, %f1172; neg.f32 %f1174, %f1170; fma.rn.f32 %f1175, %f1174, %f1167, %f1173; mul.rn.f32 %f1176, %f1157, %f1175; mov.f32 %f1177, 0f3B52E7DB; mov.f32 %f1178, 0f3A2C32E4; fma.rn.f32 %f1179, %f1178, %f1171, 
%f1177; mov.f32 %f1180, 0f3C93BB73; fma.rn.f32 %f1181, %f1179, %f1171, %f1180; mov.f32 %f1182, 0f3DF6384F; fma.rn.f32 %f1183, %f1181, %f1171, %f1182; mul.rn.f32 %f1184, %f1183, %f1171; mov.f32 %f1185, 0f3FB8AA3B; fma.rn.f32 %f1186, %f1170, %f1185, %f1166; sub.f32 %f1187, %f1166, %f1186; fma.rn.f32 %f1188, %f1170, %f1185, %f1187; fma.rn.f32 %f1189, %f1176, %f1185, %f1188; mov.f32 %f1190, 0f32A55E34; fma.rn.f32 %f1191, %f1170, %f1190, %f1189; mul.f32 %f1192, %f1184, 0f40400000; fma.rn.f32 %f1193, %f1192, %f1176, %f1191; fma.rn.f32 %f1194, %f1184, %f1170, %f1193; add.rn.f32 %f1195, %f1186, %f1194; neg.f32 %f1196, %f1186; add.rn.f32 %f1197, %f1195, %f1196; neg.f32 %f1198, %f1197; add.rn.f32 %f1199, %f1194, %f1198; mul.rn.f32 %f1200, %f1195, %f1169; neg.f32 %f1201, %f1200; fma.rn.f32 %f1202, %f1195, %f1169, %f1201; fma.rn.f32 %f1203, %f1199, %f1169, %f1202; cvt.rni.f32.f32 %f1204, %f1200; sub.f32 %f1205, %f1200, %f1204; add.f32 %f1206, %f1203, %f1205; mov.f32 %f1207, 0f3AAF85ED; mov.f32 %f1208, 0f391FCB8E; fma.rn.f32 %f1209, %f1208, %f1206, %f1207; mov.f32 %f1210, 0f3C1D9856; fma.rn.f32 %f1211, %f1209, %f1206, %f1210; mov.f32 %f1212, 0f3D6357BB; fma.rn.f32 %f1213, %f1211, %f1206, %f1212; mov.f32 %f1214, 0f3E75FDEC; fma.rn.f32 %f1215, %f1213, %f1206, %f1214; mov.f32 %f1216, 0f3F317218; fma.rn.f32 %f1217, %f1215, %f1206, %f1216; fma.rn.f32 %f1218, %f1217, %f1206, %f4817; cvt.rzi.s32.f32 %r391, %f1204; setp.gt.f32 %p77, %f1204, 0f00000000; selp.b32 %r392, 0, -2097152000, %p77; add.s32 %r393, %r392, 2130706432; mov.b32 %f1219, %r393; mul.f32 %f1220, %f1218, %f1219; shl.b32 %r394, %r391, 23; sub.s32 %r395, %r394, %r392; mov.b32 %f1221, %r395; mul.f32 %f1222, %f1220, %f1221; abs.f32 %f1223, %f1200; setp.gt.f32 %p78, %f1223, 0f43180000; setp.lt.f32 %p79, %f1200, 0f00000000; selp.f32 %f1224, 0f00000000, 0f7F800000, %p79; selp.f32 %f50, %f1224, %f1222, %p78; setp.eq.f32 %p80, %f48, 0f3F800000; mov.f32 %f4816, %f4817; @%p80 bra $L__BB0_67; add.f32 %f4754, %f40, 0fBF800000; 
abs.f32 %f4753, %f4754; setp.gtu.f32 %p81, %f4753, 0f7F800000; @%p81 bra $L__BB0_66; bra.uni $L__BB0_62; $L__BB0_66: add.f32 %f4760, %f40, 0fBF800000; mov.f32 %f1227, 0f40000000; add.rn.f32 %f4816, %f4760, %f1227; bra.uni $L__BB0_67; $L__BB0_62: add.f32 %f4757, %f40, 0fBF800000; add.f32 %f4756, %f40, 0fBF800000; abs.f32 %f4755, %f4756; setp.eq.f32 %p82, %f4756, 0f00000000; setp.eq.f32 %p83, %f4755, 0f7F800000; or.pred %p84, %p82, %p83; @%p84 bra $L__BB0_65; bra.uni $L__BB0_63; $L__BB0_65: add.f32 %f4759, %f40, 0fBF800000; setp.eq.f32 %p87, %f19, 0f3F800000; add.f32 %f1226, %f4759, %f4759; mov.b32 %r396, %f1226; and.b32 %r397, %r396, 2147483647; selp.b32 %r398, %r396, %r397, %p87; mov.b32 %f4816, %r398; bra.uni $L__BB0_67; $L__BB0_63: add.f32 %f4758, %f40, 0fBF800000; setp.geu.f32 %p85, %f4758, 0f00000000; mov.f32 %f4816, %f50; @%p85 bra $L__BB0_67; setp.eq.f32 %p86, %f19, 0f3F800000; neg.f32 %f1225, %f50; selp.f32 %f4816, %f1225, %f50, %p86; $L__BB0_67: add.f32 %f55, %f40, 0fBF000000; abs.f32 %f56, %f55; setp.lt.f32 %p88, %f56, 0f00800000; mul.f32 %f1231, %f56, 0f4B800000; selp.f32 %f1232, %f1231, %f56, %p88; selp.f32 %f1233, 0fC1C00000, 0f00000000, %p88; mov.b32 %r399, %f1232; add.s32 %r400, %r399, -1060439283; and.b32 %r401, %r400, -8388608; sub.s32 %r402, %r399, %r401; mov.b32 %f1234, %r402; cvt.rn.f32.s32 %f1235, %r401; fma.rn.f32 %f1237, %f1235, %f1165, %f1233; add.f32 %f1238, %f1234, 0fBF800000; add.f32 %f1229, %f1234, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f1228,%f1229; // end inline asm add.f32 %f1239, %f1238, %f1238; mul.f32 %f1241, %f1228, %f1239; mul.f32 %f1242, %f1241, %f1241; sub.f32 %f1243, %f1238, %f1241; add.f32 %f1244, %f1243, %f1243; neg.f32 %f1245, %f1241; fma.rn.f32 %f1246, %f1245, %f1238, %f1244; mul.rn.f32 %f1247, %f1228, %f1246; fma.rn.f32 %f1250, %f1178, %f1242, %f1177; fma.rn.f32 %f1252, %f1250, %f1242, %f1180; fma.rn.f32 %f1254, %f1252, %f1242, %f1182; mul.rn.f32 %f1255, %f1254, %f1242; fma.rn.f32 %f1257, %f1241, %f1185, 
%f1237; sub.f32 %f1258, %f1237, %f1257; fma.rn.f32 %f1259, %f1241, %f1185, %f1258; fma.rn.f32 %f1260, %f1247, %f1185, %f1259; fma.rn.f32 %f1262, %f1241, %f1190, %f1260; mul.f32 %f1263, %f1255, 0f40400000; fma.rn.f32 %f1264, %f1263, %f1247, %f1262; fma.rn.f32 %f1265, %f1255, %f1241, %f1264; add.rn.f32 %f1266, %f1257, %f1265; neg.f32 %f1267, %f1257; add.rn.f32 %f1268, %f1266, %f1267; neg.f32 %f1269, %f1268; add.rn.f32 %f1270, %f1265, %f1269; mul.rn.f32 %f1271, %f1266, %f1169; neg.f32 %f1272, %f1271; fma.rn.f32 %f1273, %f1266, %f1169, %f1272; fma.rn.f32 %f1274, %f1270, %f1169, %f1273; cvt.rni.f32.f32 %f1275, %f1271; sub.f32 %f1276, %f1271, %f1275; add.f32 %f1277, %f1274, %f1276; fma.rn.f32 %f1280, %f1208, %f1277, %f1207; fma.rn.f32 %f1282, %f1280, %f1277, %f1210; fma.rn.f32 %f1284, %f1282, %f1277, %f1212; fma.rn.f32 %f1286, %f1284, %f1277, %f1214; fma.rn.f32 %f1288, %f1286, %f1277, %f1216; fma.rn.f32 %f1289, %f1288, %f1277, %f4817; cvt.rzi.s32.f32 %r403, %f1275; setp.gt.f32 %p89, %f1275, 0f00000000; selp.b32 %r404, 0, -2097152000, %p89; add.s32 %r405, %r404, 2130706432; mov.b32 %f1290, %r405; mul.f32 %f1291, %f1289, %f1290; shl.b32 %r406, %r403, 23; sub.s32 %r407, %r406, %r404; mov.b32 %f1292, %r407; mul.f32 %f1293, %f1291, %f1292; abs.f32 %f1294, %f1271; setp.gt.f32 %p90, %f1294, 0f43180000; setp.lt.f32 %p91, %f1271, 0f00000000; selp.f32 %f1295, 0f00000000, 0f7F800000, %p91; selp.f32 %f57, %f1295, %f1293, %p90; setp.eq.f32 %p92, %f55, 0f3F800000; @%p92 bra $L__BB0_74; setp.gtu.f32 %p93, %f56, 0f7F800000; @%p93 bra $L__BB0_73; bra.uni $L__BB0_69; $L__BB0_73: mov.f32 %f1298, 0f40000000; add.rn.f32 %f4817, %f55, %f1298; bra.uni $L__BB0_74; $L__BB0_69: setp.eq.f32 %p94, %f55, 0f00000000; setp.eq.f32 %p95, %f56, 0f7F800000; or.pred %p96, %p94, %p95; @%p96 bra $L__BB0_72; bra.uni $L__BB0_70; $L__BB0_72: setp.eq.f32 %p99, %f19, 0f3F800000; add.f32 %f1297, %f55, %f55; mov.b32 %r408, %f1297; and.b32 %r409, %r408, 2147483647; selp.b32 %r410, %r408, %r409, %p99; mov.b32 %f4817, 
%r410; bra.uni $L__BB0_74; $L__BB0_70: setp.geu.f32 %p97, %f55, 0f00000000; mov.f32 %f4817, %f57; @%p97 bra $L__BB0_74; setp.eq.f32 %p98, %f19, 0f3F800000; neg.f32 %f1296, %f57; selp.f32 %f4817, %f1296, %f57, %p98; $L__BB0_74: add.u64 %rd1108, %SPL, 96; add.f32 %f4766, %f13, 0fBF800000; mul.f32 %f4765, %f848, %f4766; sub.f32 %f4764, %f4765, %f2; add.f32 %f4763, %f14, 0fBF800000; mul.f32 %f4762, %f848, %f4763; sub.f32 %f4761, %f4762, %f3; add.u64 %rd1078, %SPL, 0; ld.param.u64 %rd1043, [g2p2g_param_9]; mul.f32 %f1299, %f4812, 0f3F000000; mov.f32 %f1300, 0f3F400000; sub.f32 %f1301, %f1300, %f4813; mul.f32 %f1302, %f4814, 0f3F000000; mul.f32 %f1303, %f4815, 0f3F000000; sub.f32 %f1304, %f1300, %f4816; max.f32 %f1305, %f13, 0fCF000000; cvt.rzi.s32.f32 %r411, %f1305; add.s32 %r412, %r411, -2; setp.gt.f32 %p100, %f13, 0f4EFFFFFF; selp.b32 %r413, 2147483645, %r412, %p100; setp.num.f32 %p101, %f13, %f13; selp.b32 %r414, %r413, -2, %p101; cvt.rn.f32.s32 %f1306, %r414; mul.f32 %f1307, %f1306, 0f3E800000; cvt.rmi.f32.f32 %f1308, %f1307; setp.gt.f32 %p102, %f1308, 0f4EFFFFFF; max.f32 %f1309, %f1308, 0fCF000000; cvt.rzi.s32.f32 %r415, %f1309; setp.num.f32 %p103, %f1308, %f1308; shl.b32 %r416, %r415, 2; selp.b32 %r417, -4, %r416, %p102; selp.b32 %r418, %r417, 0, %p103; sub.s32 %r419, %r414, %r418; max.f32 %f1310, %f14, 0fCF000000; cvt.rzi.s32.f32 %r420, %f1310; add.s32 %r421, %r420, -2; setp.gt.f32 %p104, %f14, 0f4EFFFFFF; selp.b32 %r422, 2147483645, %r421, %p104; setp.num.f32 %p105, %f14, %f14; selp.b32 %r423, %r422, -2, %p105; cvt.rn.f32.s32 %f1311, %r423; mul.f32 %f1312, %f1311, 0f3E800000; cvt.rmi.f32.f32 %f1313, %f1312; setp.gt.f32 %p106, %f1313, 0f4EFFFFFF; max.f32 %f1314, %f1313, 0fCF000000; cvt.rzi.s32.f32 %r424, %f1314; setp.num.f32 %p107, %f1313, %f1313; shl.b32 %r425, %r424, 2; selp.b32 %r426, -4, %r425, %p106; selp.b32 %r427, %r426, 0, %p107; sub.s32 %r428, %r423, %r427; cvt.u64.u32 %rd526, %r428; cvt.u64.u32 %rd527, %r419; bfi.b64 %rd528, %rd526, %rd527, 32, 32; 
mul.wide.u32 %rd529, %r428, 8; and.b64 %rd530, %rd529, 4294967288; add.s64 %rd531, %rd528, %rd530; add.s64 %rd80, %rd531, 9; add.f32 %f62, %f848, %f848; add.f32 %f1315, %f4764, %f62; add.f32 %f1316, %f4761, %f62; mul.f32 %f1317, %f4817, 0f3F000000; mul.f32 %f1318, %f1302, %f1317; shl.b64 %rd532, %rd80, 6; and.b64 %rd533, %rd532, 274877906880; mov.u64 %rd534, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; add.s64 %rd535, %rd534, %rd533; ld.shared.u64 %rd536, [%rd535+1184]; cvt.u32.u64 %r429, %rd536; mov.b32 %f1319, %r429; shr.u64 %rd537, %rd536, 32; cvt.u32.u64 %r430, %rd537; mov.b32 %f1320, %r430; fma.rn.f32 %f1321, %f1318, %f1319, 0f00000000; fma.rn.f32 %f1322, %f1318, %f1320, 0f00000000; mul.f32 %f1323, %f12, %f1318; mul.f32 %f1324, %f1323, %f1319; mul.f32 %f1325, %f1323, %f1320; fma.rn.f32 %f1326, %f1315, %f1324, 0f00000000; fma.rn.f32 %f1327, %f1315, %f1325, 0f00000000; fma.rn.f32 %f1328, %f1316, %f1324, 0f00000000; fma.rn.f32 %f1329, %f1316, %f1325, 0f00000000; ld.shared.f32 %f1330, [%rd535+1188]; mul.f32 %f1331, %f1316, %f1330; fma.rn.f32 %f1332, %f1315, %f1319, %f1331; mul.f32 %f1333, %f1318, %f1332; fma.rn.f32 %f1334, %f12, %f1333, 0f00000000; mul.f32 %f63, %f848, 0f00000000; add.f32 %f1335, %f4761, %f63; ld.shared.u64 %rd538, [%rd535+160]; cvt.u32.u64 %r431, %rd538; mov.b32 %f1336, %r431; mul.f32 %f1337, %f1302, %f1303; shr.u64 %rd539, %rd538, 32; cvt.u32.u64 %r432, %rd539; mov.b32 %f1338, %r432; fma.rn.f32 %f1339, %f1337, %f1336, %f1321; fma.rn.f32 %f1340, %f1337, %f1338, %f1322; mul.f32 %f1341, %f12, %f1337; mul.f32 %f1342, %f1341, %f1336; mul.f32 %f1343, %f1341, %f1338; fma.rn.f32 %f1344, %f1315, %f1342, %f1326; fma.rn.f32 %f1345, %f1315, %f1343, %f1327; fma.rn.f32 %f1346, %f1335, %f1342, %f1328; fma.rn.f32 %f1347, %f1335, %f1343, %f1329; ld.shared.f32 %f1348, [%rd535+164]; mul.f32 %f1349, %f1335, %f1348; fma.rn.f32 %f1350, %f1315, %f1336, %f1349; mul.f32 %f1351, %f1337, %f1350; fma.rn.f32 %f1352, %f12, 
%f1351, %f1334; ld.shared.u64 %rd540, [%rd535+672]; cvt.u32.u64 %r433, %rd540; mov.b32 %f1353, %r433; mul.f32 %f1354, %f1302, %f1304; shr.u64 %rd541, %rd540, 32; cvt.u32.u64 %r434, %rd541; mov.b32 %f1355, %r434; fma.rn.f32 %f1356, %f1354, %f1353, %f1339; fma.rn.f32 %f1357, %f1354, %f1355, %f1340; mul.f32 %f1358, %f12, %f1354; mul.f32 %f1359, %f1358, %f1353; mul.f32 %f1360, %f1358, %f1355; add.f32 %f1361, %f4761, %f848; fma.rn.f32 %f1362, %f1315, %f1359, %f1344; fma.rn.f32 %f1363, %f1315, %f1360, %f1345; fma.rn.f32 %f1364, %f1361, %f1359, %f1346; fma.rn.f32 %f1365, %f1361, %f1360, %f1347; ld.shared.f32 %f1366, [%rd535+676]; mul.f32 %f1367, %f1361, %f1366; fma.rn.f32 %f1368, %f1315, %f1353, %f1367; mul.f32 %f1369, %f1354, %f1368; fma.rn.f32 %f1370, %f12, %f1369, %f1352; add.f32 %f1371, %f4764, %f63; mul.f32 %f1372, %f1299, %f1317; ld.shared.u64 %rd542, [%rd535+1056]; cvt.u32.u64 %r435, %rd542; mov.b32 %f1373, %r435; shr.u64 %rd543, %rd542, 32; cvt.u32.u64 %r436, %rd543; mov.b32 %f1374, %r436; fma.rn.f32 %f1375, %f1372, %f1373, %f1356; fma.rn.f32 %f1376, %f1372, %f1374, %f1357; mul.f32 %f1377, %f12, %f1372; mul.f32 %f1378, %f1377, %f1373; mul.f32 %f1379, %f1377, %f1374; fma.rn.f32 %f1380, %f1371, %f1378, %f1362; fma.rn.f32 %f1381, %f1371, %f1379, %f1363; fma.rn.f32 %f1382, %f1316, %f1378, %f1364; fma.rn.f32 %f1383, %f1316, %f1379, %f1365; ld.shared.f32 %f1384, [%rd535+1060]; mul.f32 %f1385, %f1316, %f1384; fma.rn.f32 %f1386, %f1371, %f1373, %f1385; mul.f32 %f1387, %f1372, %f1386; fma.rn.f32 %f1388, %f12, %f1387, %f1370; ld.shared.u64 %rd544, [%rd535+32]; cvt.u32.u64 %r437, %rd544; mov.b32 %f1389, %r437; mul.f32 %f1390, %f1299, %f1303; shr.u64 %rd545, %rd544, 32; cvt.u32.u64 %r438, %rd545; mov.b32 %f1391, %r438; fma.rn.f32 %f1392, %f1390, %f1389, %f1375; fma.rn.f32 %f1393, %f1390, %f1391, %f1376; mul.f32 %f1394, %f12, %f1390; mul.f32 %f1395, %f1394, %f1389; mul.f32 %f1396, %f1394, %f1391; fma.rn.f32 %f1397, %f1371, %f1395, %f1380; fma.rn.f32 %f1398, %f1371, %f1396, 
%f1381; fma.rn.f32 %f1399, %f1335, %f1395, %f1382; fma.rn.f32 %f1400, %f1335, %f1396, %f1383; ld.shared.f32 %f1401, [%rd535+36]; mul.f32 %f1402, %f1335, %f1401; fma.rn.f32 %f1403, %f1371, %f1389, %f1402; mul.f32 %f1404, %f1390, %f1403; fma.rn.f32 %f1405, %f12, %f1404, %f1388; ld.shared.u64 %rd546, [%rd535+544]; cvt.u32.u64 %r439, %rd546; mov.b32 %f1406, %r439; mul.f32 %f1407, %f1299, %f1304; shr.u64 %rd547, %rd546, 32; cvt.u32.u64 %r440, %rd547; mov.b32 %f1408, %r440; fma.rn.f32 %f1409, %f1407, %f1406, %f1392; fma.rn.f32 %f1410, %f1407, %f1408, %f1393; mul.f32 %f1411, %f12, %f1407; mul.f32 %f1412, %f1411, %f1406; mul.f32 %f1413, %f1411, %f1408; fma.rn.f32 %f1414, %f1371, %f1412, %f1397; fma.rn.f32 %f1415, %f1371, %f1413, %f1398; fma.rn.f32 %f1416, %f1361, %f1412, %f1399; fma.rn.f32 %f1417, %f1361, %f1413, %f1400; ld.shared.f32 %f1418, [%rd535+548]; mul.f32 %f1419, %f1361, %f1418; fma.rn.f32 %f1420, %f1371, %f1406, %f1419; mul.f32 %f1421, %f1407, %f1420; fma.rn.f32 %f1422, %f12, %f1421, %f1405; mul.f32 %f1423, %f1301, %f1317; ld.shared.u64 %rd548, [%rd535+1120]; cvt.u32.u64 %r441, %rd548; mov.b32 %f1424, %r441; shr.u64 %rd549, %rd548, 32; cvt.u32.u64 %r442, %rd549; mov.b32 %f1425, %r442; fma.rn.f32 %f1426, %f1423, %f1424, %f1409; fma.rn.f32 %f1427, %f1423, %f1425, %f1410; mul.f32 %f1428, %f12, %f1423; mul.f32 %f1429, %f1428, %f1424; mul.f32 %f1430, %f1428, %f1425; add.f32 %f1431, %f4764, %f848; fma.rn.f32 %f1432, %f1431, %f1429, %f1414; fma.rn.f32 %f1433, %f1431, %f1430, %f1415; fma.rn.f32 %f1434, %f1316, %f1429, %f1416; fma.rn.f32 %f1435, %f1316, %f1430, %f1417; ld.shared.f32 %f1436, [%rd535+1124]; mul.f32 %f1437, %f1316, %f1436; fma.rn.f32 %f1438, %f1431, %f1424, %f1437; mul.f32 %f1439, %f1423, %f1438; fma.rn.f32 %f1440, %f12, %f1439, %f1422; ld.shared.u64 %rd550, [%rd535+96]; cvt.u32.u64 %r443, %rd550; mov.b32 %f1441, %r443; mul.f32 %f1442, %f1301, %f1303; shr.u64 %rd551, %rd550, 32; cvt.u32.u64 %r444, %rd551; mov.b32 %f1443, %r444; fma.rn.f32 %f1444, %f1442, 
%f1441, %f1426; fma.rn.f32 %f1445, %f1442, %f1443, %f1427; mul.f32 %f1446, %f12, %f1442; mul.f32 %f1447, %f1446, %f1441; mul.f32 %f1448, %f1446, %f1443; fma.rn.f32 %f1449, %f1431, %f1447, %f1432; fma.rn.f32 %f1450, %f1431, %f1448, %f1433; fma.rn.f32 %f1451, %f1335, %f1447, %f1434; fma.rn.f32 %f1452, %f1335, %f1448, %f1435; ld.shared.f32 %f1453, [%rd535+100]; mul.f32 %f1454, %f1335, %f1453; fma.rn.f32 %f1455, %f1431, %f1441, %f1454; mul.f32 %f1456, %f1442, %f1455; fma.rn.f32 %f1457, %f12, %f1456, %f1440; ld.shared.u64 %rd552, [%rd535+608]; cvt.u32.u64 %r445, %rd552; mov.b32 %f1458, %r445; mul.f32 %f1459, %f1301, %f1304; shr.u64 %rd553, %rd552, 32; cvt.u32.u64 %r446, %rd553; mov.b32 %f1460, %r446; fma.rn.f32 %f4819, %f1459, %f1458, %f1444; fma.rn.f32 %f4818, %f1459, %f1460, %f1445; mul.f32 %f1461, %f12, %f1459; mul.f32 %f1462, %f1461, %f1458; mul.f32 %f1463, %f1461, %f1460; fma.rn.f32 %f4866, %f1431, %f1462, %f1449; fma.rn.f32 %f4867, %f1431, %f1463, %f1450; fma.rn.f32 %f4868, %f1361, %f1462, %f1451; fma.rn.f32 %f4869, %f1361, %f1463, %f1452; ld.shared.f32 %f1464, [%rd535+612]; mul.f32 %f1465, %f1361, %f1464; fma.rn.f32 %f1466, %f1431, %f1458, %f1465; mul.f32 %f1467, %f1459, %f1466; fma.rn.f32 %f70, %f12, %f1467, %f1457; st.local.v4.f32 [%rd1078], {%f4866, %f4867, %f4868, %f4869}; mov.b32 %r447, %f4819; mov.b32 %r448, %f4818; st.local.v2.f32 [%rd1108], {%f4819, %f4818}; cvta.to.global.u64 %rd554, %rd1043; mul.lo.s64 %rd555, %rd73, 96; add.s64 %rd81, %rd554, %rd555; ld.global.u32 %r10, [%rd81]; mov.b64 %rd1137, {%r447, %r448}; setp.eq.s16 %p108, %rs11, 0; @%p108 bra $L__BB0_76; add.u64 %rd1110, %SPL, 96; st.local.u64 [%rd1110], %rd72; cvt.u32.u64 %r449, %rd72; mov.b32 %f4819, %r449; shr.u64 %rd556, %rd72, 32; cvt.u32.u64 %r450, %rd556; mov.b32 %f4818, %r450; mov.u64 %rd1137, %rd72; $L__BB0_76: add.u64 %rd1129, %SPL, 96; add.u64 %rd1131, %SP, 96; add.u64 %rd1172, %SPL, 64; add.u64 %rd1166, %SPL, 32; add.s64 %rd1135, %rd1129, 8; add.u64 %rd89, %SPL, 32; mov.u64 %rd1136, 
2; mov.u64 %rd1130, %rd1129; mov.u64 %rd1132, %rd1129; mov.u64 %rd1133, %rd1129; mov.u64 %rd1134, %rd1131; $L__BB0_77: setp.eq.s64 %p109, %rd1136, 0; @%p109 bra $L__BB0_80; add.s64 %rd1136, %rd1136, -1; add.s64 %rd561, %rd1129, 8; setp.eq.s64 %p110, %rd1132, %rd1135; selp.b64 %rd1129, %rd561, %rd1129, %p110; add.s64 %rd562, %rd1130, 8; selp.b64 %rd1130, %rd562, %rd1130, %p110; add.s64 %rd563, %rd1131, 8; selp.b64 %rd1131, %rd563, %rd1131, %p110; selp.b64 %rd564, %rd561, %rd1132, %p110; selp.b64 %rd565, %rd562, %rd1133, %p110; selp.b64 %rd566, %rd563, %rd1134, %p110; add.s64 %rd567, %rd1132, 8; selp.b64 %rd1135, %rd567, %rd1135, %p110; setp.eq.s64 %p111, %rd1136, 0; add.s64 %rd568, %rd564, 4; add.s64 %rd569, %rd565, 4; add.s64 %rd570, %rd566, 4; selp.b64 %rd1132, %rd564, %rd568, %p111; selp.b64 %rd1133, %rd565, %rd569, %p111; selp.b64 %rd1134, %rd566, %rd570, %p111; ld.local.f32 %f1468, [%rd565]; abs.f32 %f1469, %f1468; mul.f32 %f1470, %f1469, %f846; setp.ltu.f32 %p112, %f1470, %f848; @%p112 bra $L__BB0_77; add.u64 %rd1114, %SPL, 96; setp.nan.f32 %p113, %f4819, %f4819; mov.b32 %r451, %f4819; setp.lt.s32 %p114, %r451, 0; selp.f32 %f1471, 0fBF800000, 0f3F800000, %p114; selp.f32 %f1472, 0f7FC00000, %f1471, %p113; mul.f32 %f1473, %f848, %f1472; mov.b32 %r452, %f4818; setp.lt.s32 %p115, %r452, 0; selp.f32 %f1474, 0fBF800000, 0f3F800000, %p115; setp.nan.f32 %p116, %f4818, %f4818; selp.f32 %f1475, 0f7FC00000, %f1474, %p116; mul.f32 %f1476, %f848, %f1475; div.rn.f32 %f1477, %f1476, %f846; mov.b32 %r453, %f1477; div.rn.f32 %f1478, %f1473, %f846; mov.b32 %r454, %f1478; st.local.v2.f32 [%rd1114], {%f1478, %f1477}; mov.b64 %rd1137, {%r454, %r453}; $L__BB0_80: cvt.u32.u64 %r455, %rd1137; mov.b32 %f1479, %r455; shr.u64 %rd571, %rd1137, 32; cvt.u32.u64 %r456, %rd571; mov.b32 %f1480, %r456; fma.rn.f32 %f79, %f1479, %f846, %f2; fma.rn.f32 %f80, %f1480, %f846, %f3; setp.eq.s32 %p117, %r10, 2; @%p117 bra $L__BB0_82; bra.uni $L__BB0_81; $L__BB0_82: mul.f32 %f1494, %f70, %f846; 
fma.rn.f32 %f4822, %f1494, %f7, %f7; mov.u64 %rd1138, %rd77; bra.uni $L__BB0_83; $L__BB0_81: mul.f32 %f1481, %f4866, %f846; mul.f32 %f1482, %f4867, %f846; mul.f32 %f1483, %f4868, %f846; mul.f32 %f1484, %f1483, %f4864; fma.rn.f32 %f1485, %f1481, %f7, %f1484; mul.f32 %f1486, %f4869, %f846; mul.f32 %f1487, %f1486, %f4864; fma.rn.f32 %f1488, %f1482, %f7, %f1487; mul.f32 %f1489, %f1483, %f10; fma.rn.f32 %f1490, %f1481, %f4863, %f1489; mul.f32 %f1491, %f1486, %f10; fma.rn.f32 %f1492, %f1482, %f4863, %f1491; add.f32 %f4864, %f4864, %f1488; add.f32 %f1493, %f7, %f1485; st.local.v2.f32 [%rd77], {%f1493, %f4864}; add.f32 %f4863, %f1490, %f4863; st.local.f32 [%rd77+8], %f4863; add.f32 %f4822, %f1492, %f10; add.s64 %rd1138, %rd77, 12; $L__BB0_83: st.local.f32 [%rd1138], %f4822; ld.global.u32 %r11, [%rd81+32]; setp.eq.s32 %p118, %r11, 5; add.s64 %rd111, %rd5, 24; @%p118 bra $L__BB0_297; bra.uni $L__BB0_84; $L__BB0_297: setp.eq.s16 %p414, %rs10, 0; @%p414 bra $L__BB0_299; add.u64 %rd1096, %SPL, 96; add.u64 %rd1090, %SPL, 0; mov.f32 %f4866, 0f00000000; st.local.v2.f32 [%rd1096], {%f4866, %f4866}; st.local.v4.f32 [%rd1090], {%f4866, %f4866, %f4866, %f4866}; mov.f32 %f4867, %f4866; mov.f32 %f4868, %f4866; mov.f32 %f4869, %f4866; $L__BB0_299: ld.local.f32 %f4944, [%rd77+12]; ld.local.f32 %f4875, [%rd77]; mul.f32 %f2980, %f4875, %f4944; mul.f32 %f379, %f4863, %f4864; sub.f32 %f380, %f2980, %f379; div.rn.f32 %f381, %f4, %f5; div.rn.f32 %f2981, %f381, %f380; setp.eq.f32 %p415, %f2981, 0f00000000; setp.ne.s16 %p416, %rs9, 0; or.pred %p417, %p416, %p415; @%p417 bra $L__BB0_509; bra.uni $L__BB0_300; $L__BB0_509: mov.u64 %rd906, 0; st.local.v2.u64 [%rd1], {%rd906, %rd906}; mov.u32 %r1602, 1065353216; st.local.u32 [%rd1], %r1602; st.local.u32 [%rd1+12], %r1602; ld.local.v2.u64 {%rd907, %rd908}, [%rd1]; mov.b64 {%r1603, %r1604}, %rd908; mov.b64 {%r1605, %r1606}, %rd907; st.local.v2.u64 [%rd77], {%rd907, %rd908}; mov.b32 %f4875, %r1605; mov.b32 %f4864, %r1606; mov.b32 %f4863, %r1603; mov.b32 
%f4944, %r1604; mov.u16 %rs37, 1; bra.uni $L__BB0_510; $L__BB0_84: cvt.u16.u32 %rs14, %r11; setp.gt.s16 %p119, %rs14, 2; @%p119 bra $L__BB0_87; setp.eq.s16 %p122, %rs14, 1; @%p122 bra $L__BB0_160; setp.eq.s16 %p123, %rs14, 2; @%p123 bra $L__BB0_116; bra.uni $L__BB0_239; $L__BB0_116: ld.global.u64 %rd602, [%rd81+56]; shl.b64 %rd603, %rd70, 4; add.s64 %rd604, %rd602, %rd603; add.s64 %rd121, %rd604, 4; ld.global.f32 %f119, [%rd81+44]; ld.global.f32 %f120, [%rd81+40]; ld.local.v4.f32 {%f1682, %f1683, %f1684, %f1685}, [%rd77]; add.f32 %f1688, %f1685, %f1682; mul.f32 %f121, %f1688, 0f3F000000; sub.f32 %f1689, %f1682, %f1685; mul.f32 %f1690, %f1689, 0f3F000000; add.f32 %f1693, %f1683, %f1684; mul.f32 %f1694, %f1693, 0f3F000000; sub.f32 %f1695, %f1683, %f1684; mul.f32 %f122, %f1695, 0f3F000000; mul.f32 %f1696, %f122, %f122; fma.rn.f32 %f1697, %f121, %f121, %f1696; sqrt.rn.f32 %f1698, %f1697; mul.f32 %f1699, %f1694, %f1694; fma.rn.f32 %f1700, %f1690, %f1690, %f1699; sqrt.rn.f32 %f1701, %f1700; add.f32 %f123, %f1698, %f1701; sub.f32 %f124, %f1698, %f1701; abs.f32 %f125, %f1690; abs.f32 %f126, %f1694; setp.eq.f32 %p165, %f125, 0f00000000; setp.eq.f32 %p166, %f126, 0f00000000; and.pred %p167, %p165, %p166; mov.b32 %r54, %f1690; mov.b32 %r578, %f1694; and.b32 %r55, %r578, -2147483648; @%p167 bra $L__BB0_120; bra.uni $L__BB0_117; $L__BB0_120: shr.s32 %r583, %r54, 31; and.b32 %r584, %r583, 1078530011; or.b32 %r585, %r584, %r55; mov.b32 %f4827, %r585; bra.uni $L__BB0_121; $L__BB0_300: @%p117 bra $L__BB0_302; abs.f32 %f2982, %f4875; setp.gt.f32 %p419, %f2982, 0f461C4000; @%p419 bra $L__BB0_509; $L__BB0_302: ld.global.u16 %rs7, [%rd81]; mov.f32 %f4876, 0f00000000; setp.eq.s16 %p420, %rs7, 0; @%p420 bra $L__BB0_317; setp.ne.s16 %p421, %rs7, 1; @%p421 bra $L__BB0_319; mov.b32 %f4874, %r8; ld.global.u64 %rd739, [%rd81+24]; shl.b64 %rd740, %rd70, 4; add.s64 %rd741, %rd739, %rd740; ld.f32 %f383, [%rd741+8]; ld.global.f32 %f384, [%rd81+16]; mul.f32 %f2987, %f4863, %f4863; fma.rn.f32 
%f385, %f4875, %f4875, %f2987; mul.f32 %f2988, %f4944, %f4944; fma.rn.f32 %f386, %f4864, %f4864, %f2988; mov.f32 %f2989, 0fBF000000; cvt.rzi.f32.f32 %f2990, %f2989; add.f32 %f2991, %f2990, %f2990; mov.f32 %f2992, 0fBF800000; sub.f32 %f2993, %f2992, %f2991; abs.f32 %f387, %f2993; abs.f32 %f388, %f380; setp.lt.f32 %p422, %f388, 0f00800000; mul.f32 %f2994, %f388, 0f4B800000; selp.f32 %f2995, %f2994, %f388, %p422; selp.f32 %f2996, 0fC1C00000, 0f00000000, %p422; mov.b32 %r1129, %f2995; add.s32 %r1130, %r1129, -1060439283; and.b32 %r1131, %r1130, -8388608; sub.s32 %r1132, %r1129, %r1131; mov.b32 %f2997, %r1132; cvt.rn.f32.s32 %f2998, %r1131; mov.f32 %f2999, 0f34000000; fma.rn.f32 %f3000, %f2998, %f2999, %f2996; add.f32 %f3001, %f2997, 0fBF800000; add.f32 %f2985, %f2997, 0f3F800000; mov.f32 %f4870, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f2984,%f2985; // end inline asm add.f32 %f3002, %f3001, %f3001; mul.f32 %f3003, %f2984, %f3002; mul.f32 %f3004, %f3003, %f3003; sub.f32 %f3005, %f3001, %f3003; add.f32 %f3006, %f3005, %f3005; neg.f32 %f3007, %f3003; fma.rn.f32 %f3008, %f3007, %f3001, %f3006; mul.rn.f32 %f3009, %f2984, %f3008; mov.f32 %f3010, 0f3B52E7DB; mov.f32 %f3011, 0f3A2C32E4; fma.rn.f32 %f3012, %f3011, %f3004, %f3010; mov.f32 %f3013, 0f3C93BB73; fma.rn.f32 %f3014, %f3012, %f3004, %f3013; mov.f32 %f3015, 0f3DF6384F; fma.rn.f32 %f3016, %f3014, %f3004, %f3015; mul.rn.f32 %f3017, %f3016, %f3004; mov.f32 %f3018, 0f3FB8AA3B; fma.rn.f32 %f3019, %f3003, %f3018, %f3000; sub.f32 %f3020, %f3000, %f3019; fma.rn.f32 %f3021, %f3003, %f3018, %f3020; fma.rn.f32 %f3022, %f3009, %f3018, %f3021; mov.f32 %f3023, 0f32A55E34; fma.rn.f32 %f3024, %f3003, %f3023, %f3022; mul.f32 %f3025, %f3017, 0f40400000; fma.rn.f32 %f3026, %f3025, %f3009, %f3024; fma.rn.f32 %f3027, %f3017, %f3003, %f3026; add.rn.f32 %f3028, %f3019, %f3027; neg.f32 %f3029, %f3019; add.rn.f32 %f3030, %f3028, %f3029; neg.f32 %f3031, %f3030; add.rn.f32 %f3032, %f3027, %f3031; mul.rn.f32 %f3033, %f3028, %f2992; 
neg.f32 %f3034, %f3033; fma.rn.f32 %f3035, %f3028, %f2992, %f3034; fma.rn.f32 %f3036, %f3032, %f2992, %f3035; cvt.rni.f32.f32 %f3037, %f3033; sub.f32 %f3038, %f3033, %f3037; add.f32 %f3039, %f3036, %f3038; mov.f32 %f3040, 0f3AAF85ED; mov.f32 %f3041, 0f391FCB8E; fma.rn.f32 %f3042, %f3041, %f3039, %f3040; mov.f32 %f3043, 0f3C1D9856; fma.rn.f32 %f3044, %f3042, %f3039, %f3043; mov.f32 %f3045, 0f3D6357BB; fma.rn.f32 %f3046, %f3044, %f3039, %f3045; mov.f32 %f3047, 0f3E75FDEC; fma.rn.f32 %f3048, %f3046, %f3039, %f3047; mov.f32 %f3049, 0f3F317218; fma.rn.f32 %f3050, %f3048, %f3039, %f3049; fma.rn.f32 %f3051, %f3050, %f3039, %f4870; cvt.rzi.s32.f32 %r1133, %f3037; setp.gt.f32 %p423, %f3037, 0f00000000; selp.b32 %r1134, 0, -2097152000, %p423; add.s32 %r1135, %r1134, 2130706432; mov.b32 %f3052, %r1135; mul.f32 %f3053, %f3051, %f3052; shl.b32 %r1136, %r1133, 23; sub.s32 %r1137, %r1136, %r1134; mov.b32 %f3054, %r1137; mul.f32 %f3055, %f3053, %f3054; abs.f32 %f3056, %f3033; setp.gt.f32 %p424, %f3056, 0f43180000; setp.lt.f32 %p425, %f3033, 0f00000000; selp.f32 %f3057, 0f00000000, 0f7F800000, %p425; selp.f32 %f389, %f3057, %f3055, %p424; setp.eq.f32 %p426, %f380, 0f3F800000; @%p426 bra $L__BB0_311; setp.gtu.f32 %p427, %f388, 0f7F800000; @%p427 bra $L__BB0_310; bra.uni $L__BB0_306; $L__BB0_310: mov.f32 %f3060, 0fBF800000; add.rn.f32 %f4870, %f380, %f3060; bra.uni $L__BB0_311; $L__BB0_87: setp.eq.s16 %p120, %rs14, 4; @%p120 bra $L__BB0_297; setp.ne.s16 %p121, %rs14, 3; @%p121 bra $L__BB0_239; ld.global.u64 %rd572, [%rd81+56]; shl.b64 %rd573, %rd70, 4; add.s64 %rd574, %rd572, %rd573; add.s64 %rd112, %rd574, 8; ld.local.v4.f32 {%f1495, %f1496, %f1497, %f1498}, [%rd77]; add.f32 %f1501, %f1498, %f1495; mul.f32 %f88, %f1501, 0f3F000000; sub.f32 %f1502, %f1495, %f1498; mul.f32 %f1503, %f1502, 0f3F000000; add.f32 %f1506, %f1496, %f1497; mul.f32 %f1507, %f1506, 0f3F000000; sub.f32 %f1508, %f1496, %f1497; mul.f32 %f89, %f1508, 0f3F000000; mul.f32 %f1509, %f89, %f89; fma.rn.f32 %f1510, %f88, 
%f88, %f1509; sqrt.rn.f32 %f1511, %f1510; mul.f32 %f1512, %f1507, %f1507; fma.rn.f32 %f1513, %f1503, %f1503, %f1512; sqrt.rn.f32 %f1514, %f1513; add.f32 %f90, %f1511, %f1514; sub.f32 %f91, %f1511, %f1514; abs.f32 %f92, %f1503; abs.f32 %f93, %f1507; setp.eq.f32 %p124, %f92, 0f00000000; setp.eq.f32 %p125, %f93, 0f00000000; and.pred %p126, %p124, %p125; mov.b32 %r12, %f1503; mov.b32 %r457, %f1507; and.b32 %r13, %r457, -2147483648; @%p126 bra $L__BB0_93; bra.uni $L__BB0_90; $L__BB0_93: shr.s32 %r462, %r12, 31; and.b32 %r463, %r462, 1078530011; or.b32 %r464, %r463, %r13; mov.b32 %f4823, %r464; bra.uni $L__BB0_94; $L__BB0_160: ld.global.u64 %rd643, [%rd81+64]; shl.b64 %rd644, %rd70, 4; add.s64 %rd144, %rd643, %rd644; ld.f32 %f4851, [%rd144]; ld.global.f32 %f175, [%rd81+52]; ld.global.f32 %f176, [%rd81+56]; ld.global.f32 %f177, [%rd81+60]; ld.local.v2.u64 {%rd1157, %rd1158}, [%rd77]; mov.b64 {%r714, %r715}, %rd1158; mov.b64 {%r716, %r717}, %rd1157; mov.b32 %f1942, %r716; mov.b32 %f1943, %r715; add.f32 %f1944, %f1943, %f1942; mul.f32 %f178, %f1944, 0f3F000000; sub.f32 %f1945, %f1942, %f1943; mul.f32 %f1946, %f1945, 0f3F000000; mov.b32 %f1947, %r717; mov.b32 %f1948, %r714; add.f32 %f1949, %f1947, %f1948; mul.f32 %f1950, %f1949, 0f3F000000; sub.f32 %f1951, %f1947, %f1948; mul.f32 %f179, %f1951, 0f3F000000; mul.f32 %f1952, %f179, %f179; fma.rn.f32 %f1953, %f178, %f178, %f1952; sqrt.rn.f32 %f1954, %f1953; mul.f32 %f1955, %f1950, %f1950; fma.rn.f32 %f1956, %f1946, %f1946, %f1955; sqrt.rn.f32 %f1957, %f1956; add.f32 %f180, %f1954, %f1957; sub.f32 %f1958, %f1954, %f1957; setp.lt.f32 %p227, %f1958, 0f00000000; selp.f32 %f181, 0fBF800000, 0f3F800000, %p227; mul.f32 %f182, %f1958, %f181; abs.f32 %f183, %f1946; abs.f32 %f184, %f1950; setp.eq.f32 %p228, %f183, 0f00000000; setp.eq.f32 %p229, %f184, 0f00000000; and.pred %p230, %p228, %p229; mov.b32 %r96, %f1946; mov.b32 %r718, %f1950; and.b32 %r97, %r718, -2147483648; @%p230 bra $L__BB0_164; bra.uni $L__BB0_161; $L__BB0_164: shr.s32 
%r723, %r96, 31; and.b32 %r724, %r723, 1078530011; or.b32 %r725, %r724, %r97; mov.b32 %f4835, %r725; bra.uni $L__BB0_165; $L__BB0_239: mov.b32 %f2591, %r8; ld.global.u64 %rd670, [%rd81+72]; shl.b64 %rd671, %rd70, 4; add.s64 %rd672, %rd670, %rd671; add.s64 %rd167, %rd672, 4; ld.global.u8 %rs27, [%rd81+64]; setp.ne.s16 %p339, %rs27, 0; setp.neu.f32 %p340, %f2591, 0f00000000; and.pred %p341, %p340, %p339; @%p341 bra $L__BB0_297; ld.local.v4.f32 {%f2592, %f2593, %f2594, %f2595}, [%rd77]; add.f32 %f2598, %f2595, %f2592; mul.f32 %f290, %f2598, 0f3F000000; sub.f32 %f2599, %f2592, %f2595; mul.f32 %f2600, %f2599, 0f3F000000; add.f32 %f2603, %f2593, %f2594; mul.f32 %f2604, %f2603, 0f3F000000; sub.f32 %f2605, %f2593, %f2594; mul.f32 %f291, %f2605, 0f3F000000; mul.f32 %f2606, %f291, %f291; fma.rn.f32 %f2607, %f290, %f290, %f2606; sqrt.rn.f32 %f2608, %f2607; mul.f32 %f2609, %f2604, %f2604; fma.rn.f32 %f2610, %f2600, %f2600, %f2609; sqrt.rn.f32 %f2611, %f2610; add.f32 %f292, %f2608, %f2611; sub.f32 %f2612, %f2608, %f2611; setp.lt.f32 %p342, %f2612, 0f00000000; selp.f32 %f293, 0fBF800000, 0f3F800000, %p342; mul.f32 %f294, %f2612, %f293; abs.f32 %f295, %f2600; abs.f32 %f296, %f2604; setp.eq.f32 %p343, %f295, 0f00000000; setp.eq.f32 %p344, %f296, 0f00000000; and.pred %p345, %p343, %p344; mov.b32 %r138, %f2600; mov.b32 %r935, %f2604; and.b32 %r139, %r935, -2147483648; @%p345 bra $L__BB0_244; bra.uni $L__BB0_241; $L__BB0_244: shr.s32 %r940, %r138, 31; and.b32 %r941, %r940, 1078530011; or.b32 %r942, %r941, %r139; mov.b32 %f4852, %r942; bra.uni $L__BB0_245; $L__BB0_317: ld.global.u64 %rd742, [%rd81+24]; shl.b64 %rd743, %rd70, 4; add.s64 %rd744, %rd742, %rd743; ld.f32 %f406, [%rd744+8]; ld.local.v4.f32 {%f4875, %f3106, %f3107, %f3108}, [%rd77]; mul.f32 %f3112, %f3108, %f4875; mul.f32 %f3113, %f3107, %f3106; sub.f32 %f408, %f3112, %f3113; add.f32 %f3114, %f3108, %f4875; mul.f32 %f3115, %f3114, 0f3F000000; sub.f32 %f3116, %f4875, %f3108; mul.f32 %f3117, %f3116, 0f3F000000; add.f32 %f3118, 
%f3106, %f3107; mul.f32 %f3119, %f3118, 0f3F000000; sub.f32 %f3120, %f3106, %f3107; mul.f32 %f3121, %f3120, 0f3F000000; mul.f32 %f3122, %f3121, %f3121; fma.rn.f32 %f3123, %f3115, %f3115, %f3122; sqrt.rn.f32 %f3124, %f3123; mul.f32 %f3125, %f3119, %f3119; fma.rn.f32 %f3126, %f3117, %f3117, %f3125; sqrt.rn.f32 %f3127, %f3126; add.f32 %f3128, %f3124, %f3127; sub.f32 %f3129, %f3124, %f3127; setp.lt.f32 %p438, %f3129, 0f00000000; mov.f32 %f3130, 0f00000000; selp.f32 %f3131, 0fBF800000, 0f3F800000, %p438; add.f32 %f3132, %f3128, 0fBF800000; max.f32 %f3133, %f3132, %f3130; fma.rn.f32 %f3134, %f3129, %f3131, 0fBF800000; max.f32 %f3135, %f3134, %f3130; ld.global.f32 %f3136, [%rd81+20]; mul.f32 %f3137, %f406, %f3136; mul.f32 %f3138, %f3135, %f3135; fma.rn.f32 %f3139, %f3133, %f3133, %f3138; add.f32 %f3140, %f3139, 0f00000000; mul.f32 %f4876, %f3137, %f3140; setp.lt.f32 %p439, %f408, 0f3F800000; @%p439 bra $L__BB0_319; add.f32 %f3141, %f408, 0fBF800000; ld.global.f32 %f3142, [%rd81+16]; mul.f32 %f3143, %f406, %f3142; mul.f32 %f3144, %f3143, 0f3F000000; mul.f32 %f3145, %f3141, %f3144; fma.rn.f32 %f4876, %f3141, %f3145, %f4876; bra.uni $L__BB0_319; $L__BB0_161: setp.eq.f32 %p231, %f183, 0f7F800000; setp.eq.f32 %p232, %f184, 0f7F800000; and.pred %p233, %p231, %p232; @%p233 bra $L__BB0_163; bra.uni $L__BB0_162; $L__BB0_163: setp.lt.s32 %p237, %r96, 0; selp.b32 %r721, 1075235812, 1061752795, %p237; or.b32 %r722, %r721, %r97; mov.b32 %f4835, %r722; bra.uni $L__BB0_165; $L__BB0_117: setp.eq.f32 %p168, %f125, 0f7F800000; setp.eq.f32 %p169, %f126, 0f7F800000; and.pred %p170, %p168, %p169; @%p170 bra $L__BB0_119; bra.uni $L__BB0_118; $L__BB0_119: setp.lt.s32 %p174, %r54, 0; selp.b32 %r581, 1075235812, 1061752795, %p174; or.b32 %r582, %r581, %r55; mov.b32 %f4827, %r582; bra.uni $L__BB0_121; $L__BB0_90: setp.eq.f32 %p127, %f92, 0f7F800000; setp.eq.f32 %p128, %f93, 0f7F800000; and.pred %p129, %p127, %p128; @%p129 bra $L__BB0_92; bra.uni $L__BB0_91; $L__BB0_92: setp.lt.s32 %p133, %r12, 0; 
selp.b32 %r460, 1075235812, 1061752795, %p133; or.b32 %r461, %r460, %r13; mov.b32 %f4823, %r461; bra.uni $L__BB0_94; $L__BB0_162: setp.lt.s32 %p234, %r96, 0; min.f32 %f1959, %f184, %f183; max.f32 %f1960, %f184, %f183; div.rn.f32 %f1961, %f1959, %f1960; mul.rn.f32 %f1962, %f1961, %f1961; mov.f32 %f1963, 0fC0B59883; mov.f32 %f1964, 0fBF52C7EA; fma.rn.f32 %f1965, %f1962, %f1964, %f1963; mov.f32 %f1966, 0fC0D21907; fma.rn.f32 %f1967, %f1965, %f1962, %f1966; mul.f32 %f1968, %f1962, %f1967; mul.f32 %f1969, %f1961, %f1968; add.f32 %f1970, %f1962, 0f41355DC0; mov.f32 %f1971, 0f41E6BD60; fma.rn.f32 %f1972, %f1970, %f1962, %f1971; mov.f32 %f1973, 0f419D92C8; fma.rn.f32 %f1974, %f1972, %f1962, %f1973; rcp.rn.f32 %f1975, %f1974; fma.rn.f32 %f1976, %f1969, %f1975, %f1961; mov.f32 %f1977, 0f3FC90FDB; sub.f32 %f1978, %f1977, %f1976; setp.gt.f32 %p235, %f184, %f183; selp.f32 %f1979, %f1978, %f1976, %p235; mov.f32 %f1980, 0f40490FDB; sub.f32 %f1981, %f1980, %f1979; selp.f32 %f1982, %f1981, %f1979, %p234; mov.b32 %r719, %f1982; or.b32 %r720, %r97, %r719; mov.b32 %f1983, %r720; add.f32 %f1984, %f183, %f184; setp.le.f32 %p236, %f1984, 0f7F800000; selp.f32 %f4835, %f1983, %f1984, %p236; $L__BB0_165: abs.f32 %f189, %f178; setp.eq.f32 %p238, %f189, 0f00000000; abs.f32 %f190, %f179; setp.eq.f32 %p239, %f190, 0f00000000; and.pred %p240, %p238, %p239; mov.b32 %r98, %f178; mov.b32 %r726, %f179; and.b32 %r99, %r726, -2147483648; @%p240 bra $L__BB0_169; bra.uni $L__BB0_166; $L__BB0_169: shr.s32 %r731, %r98, 31; and.b32 %r732, %r731, 1078530011; or.b32 %r733, %r732, %r99; mov.b32 %f4836, %r733; bra.uni $L__BB0_170; $L__BB0_166: setp.eq.f32 %p241, %f189, 0f7F800000; setp.eq.f32 %p242, %f190, 0f7F800000; and.pred %p243, %p241, %p242; @%p243 bra $L__BB0_168; bra.uni $L__BB0_167; $L__BB0_168: setp.lt.s32 %p247, %r98, 0; selp.b32 %r729, 1075235812, 1061752795, %p247; or.b32 %r730, %r729, %r99; mov.b32 %f4836, %r730; bra.uni $L__BB0_170; $L__BB0_167: setp.lt.s32 %p244, %r98, 0; min.f32 %f1985, %f190, 
%f189; max.f32 %f1986, %f190, %f189; div.rn.f32 %f1987, %f1985, %f1986; mul.rn.f32 %f1988, %f1987, %f1987; mov.f32 %f1989, 0fC0B59883; mov.f32 %f1990, 0fBF52C7EA; fma.rn.f32 %f1991, %f1988, %f1990, %f1989; mov.f32 %f1992, 0fC0D21907; fma.rn.f32 %f1993, %f1991, %f1988, %f1992; mul.f32 %f1994, %f1988, %f1993; mul.f32 %f1995, %f1987, %f1994; add.f32 %f1996, %f1988, 0f41355DC0; mov.f32 %f1997, 0f41E6BD60; fma.rn.f32 %f1998, %f1996, %f1988, %f1997; mov.f32 %f1999, 0f419D92C8; fma.rn.f32 %f2000, %f1998, %f1988, %f1999; rcp.rn.f32 %f2001, %f2000; fma.rn.f32 %f2002, %f1995, %f2001, %f1987; mov.f32 %f2003, 0f3FC90FDB; sub.f32 %f2004, %f2003, %f2002; setp.gt.f32 %p245, %f190, %f189; selp.f32 %f2005, %f2004, %f2002, %p245; mov.f32 %f2006, 0f40490FDB; sub.f32 %f2007, %f2006, %f2005; selp.f32 %f2008, %f2007, %f2005, %p244; mov.b32 %r727, %f2008; or.b32 %r728, %r99, %r727; mov.b32 %f2009, %r728; add.f32 %f2010, %f189, %f190; setp.le.f32 %p246, %f2010, 0f7F800000; selp.f32 %f4836, %f2009, %f2010, %p246; $L__BB0_170: sub.f32 %f2011, %f4836, %f4835; mul.f32 %f195, %f2011, 0f3F000000; add.f32 %f2012, %f4835, %f4836; mul.f32 %f196, %f2012, 0f3F000000; mul.f32 %f2013, %f195, 0f3F22F983; cvt.rni.s32.f32 %r1659, %f2013; cvt.rn.f32.s32 %f2014, %r1659; mov.f32 %f2015, 0fBFC90FDA; fma.rn.f32 %f2016, %f2014, %f2015, %f195; mov.f32 %f2017, 0fB3A22168; fma.rn.f32 %f2018, %f2014, %f2017, %f2016; mov.f32 %f2019, 0fA7C234C5; fma.rn.f32 %f4837, %f2014, %f2019, %f2018; abs.f32 %f198, %f195; setp.ltu.f32 %p248, %f198, 0f47CE4780; @%p248 bra $L__BB0_178; setp.eq.f32 %p249, %f198, 0f7F800000; @%p249 bra $L__BB0_177; bra.uni $L__BB0_172; $L__BB0_177: mov.f32 %f2022, 0f00000000; mul.rn.f32 %f4837, %f195, %f2022; mov.u32 %r1659, 0; bra.uni $L__BB0_178; $L__BB0_118: setp.lt.s32 %p171, %r54, 0; min.f32 %f1702, %f126, %f125; max.f32 %f1703, %f126, %f125; div.rn.f32 %f1704, %f1702, %f1703; mul.rn.f32 %f1705, %f1704, %f1704; mov.f32 %f1706, 0fC0B59883; mov.f32 %f1707, 0fBF52C7EA; fma.rn.f32 %f1708, %f1705, 
%f1707, %f1706; mov.f32 %f1709, 0fC0D21907; fma.rn.f32 %f1710, %f1708, %f1705, %f1709; mul.f32 %f1711, %f1705, %f1710; mul.f32 %f1712, %f1704, %f1711; add.f32 %f1713, %f1705, 0f41355DC0; mov.f32 %f1714, 0f41E6BD60; fma.rn.f32 %f1715, %f1713, %f1705, %f1714; mov.f32 %f1716, 0f419D92C8; fma.rn.f32 %f1717, %f1715, %f1705, %f1716; rcp.rn.f32 %f1718, %f1717; fma.rn.f32 %f1719, %f1712, %f1718, %f1704; mov.f32 %f1720, 0f3FC90FDB; sub.f32 %f1721, %f1720, %f1719; setp.gt.f32 %p172, %f126, %f125; selp.f32 %f1722, %f1721, %f1719, %p172; mov.f32 %f1723, 0f40490FDB; sub.f32 %f1724, %f1723, %f1722; selp.f32 %f1725, %f1724, %f1722, %p171; mov.b32 %r579, %f1725; or.b32 %r580, %r55, %r579; mov.b32 %f1726, %r580; add.f32 %f1727, %f125, %f126; setp.le.f32 %p173, %f1727, 0f7F800000; selp.f32 %f4827, %f1726, %f1727, %p173; $L__BB0_121: abs.f32 %f131, %f121; setp.eq.f32 %p175, %f131, 0f00000000; abs.f32 %f132, %f122; setp.eq.f32 %p176, %f132, 0f00000000; and.pred %p177, %p175, %p176; mov.b32 %r56, %f121; mov.b32 %r586, %f122; and.b32 %r57, %r586, -2147483648; @%p177 bra $L__BB0_125; bra.uni $L__BB0_122; $L__BB0_125: shr.s32 %r591, %r56, 31; and.b32 %r592, %r591, 1078530011; or.b32 %r593, %r592, %r57; mov.b32 %f4828, %r593; bra.uni $L__BB0_126; $L__BB0_122: setp.eq.f32 %p178, %f131, 0f7F800000; setp.eq.f32 %p179, %f132, 0f7F800000; and.pred %p180, %p178, %p179; @%p180 bra $L__BB0_124; bra.uni $L__BB0_123; $L__BB0_124: setp.lt.s32 %p184, %r56, 0; selp.b32 %r589, 1075235812, 1061752795, %p184; or.b32 %r590, %r589, %r57; mov.b32 %f4828, %r590; bra.uni $L__BB0_126; $L__BB0_91: setp.lt.s32 %p130, %r12, 0; min.f32 %f1515, %f93, %f92; max.f32 %f1516, %f93, %f92; div.rn.f32 %f1517, %f1515, %f1516; mul.rn.f32 %f1518, %f1517, %f1517; mov.f32 %f1519, 0fC0B59883; mov.f32 %f1520, 0fBF52C7EA; fma.rn.f32 %f1521, %f1518, %f1520, %f1519; mov.f32 %f1522, 0fC0D21907; fma.rn.f32 %f1523, %f1521, %f1518, %f1522; mul.f32 %f1524, %f1518, %f1523; mul.f32 %f1525, %f1517, %f1524; add.f32 %f1526, %f1518, 
0f41355DC0; mov.f32 %f1527, 0f41E6BD60; fma.rn.f32 %f1528, %f1526, %f1518, %f1527; mov.f32 %f1529, 0f419D92C8; fma.rn.f32 %f1530, %f1528, %f1518, %f1529; rcp.rn.f32 %f1531, %f1530; fma.rn.f32 %f1532, %f1525, %f1531, %f1517; mov.f32 %f1533, 0f3FC90FDB; sub.f32 %f1534, %f1533, %f1532; setp.gt.f32 %p131, %f93, %f92; selp.f32 %f1535, %f1534, %f1532, %p131; mov.f32 %f1536, 0f40490FDB; sub.f32 %f1537, %f1536, %f1535; selp.f32 %f1538, %f1537, %f1535, %p130; mov.b32 %r458, %f1538; or.b32 %r459, %r13, %r458; mov.b32 %f1539, %r459; add.f32 %f1540, %f92, %f93; setp.le.f32 %p132, %f1540, 0f7F800000; selp.f32 %f4823, %f1539, %f1540, %p132; $L__BB0_94: abs.f32 %f98, %f88; setp.eq.f32 %p134, %f98, 0f00000000; abs.f32 %f99, %f89; setp.eq.f32 %p135, %f99, 0f00000000; and.pred %p136, %p134, %p135; mov.b32 %r14, %f88; mov.b32 %r465, %f89; and.b32 %r15, %r465, -2147483648; @%p136 bra $L__BB0_98; bra.uni $L__BB0_95; $L__BB0_98: shr.s32 %r470, %r14, 31; and.b32 %r471, %r470, 1078530011; or.b32 %r472, %r471, %r15; mov.b32 %f4824, %r472; bra.uni $L__BB0_99; $L__BB0_95: setp.eq.f32 %p137, %f98, 0f7F800000; setp.eq.f32 %p138, %f99, 0f7F800000; and.pred %p139, %p137, %p138; @%p139 bra $L__BB0_97; bra.uni $L__BB0_96; $L__BB0_97: setp.lt.s32 %p143, %r14, 0; selp.b32 %r468, 1075235812, 1061752795, %p143; or.b32 %r469, %r468, %r15; mov.b32 %f4824, %r469; bra.uni $L__BB0_99; $L__BB0_123: setp.lt.s32 %p181, %r56, 0; min.f32 %f1728, %f132, %f131; max.f32 %f1729, %f132, %f131; div.rn.f32 %f1730, %f1728, %f1729; mul.rn.f32 %f1731, %f1730, %f1730; mov.f32 %f1732, 0fC0B59883; mov.f32 %f1733, 0fBF52C7EA; fma.rn.f32 %f1734, %f1731, %f1733, %f1732; mov.f32 %f1735, 0fC0D21907; fma.rn.f32 %f1736, %f1734, %f1731, %f1735; mul.f32 %f1737, %f1731, %f1736; mul.f32 %f1738, %f1730, %f1737; add.f32 %f1739, %f1731, 0f41355DC0; mov.f32 %f1740, 0f41E6BD60; fma.rn.f32 %f1741, %f1739, %f1731, %f1740; mov.f32 %f1742, 0f419D92C8; fma.rn.f32 %f1743, %f1741, %f1731, %f1742; rcp.rn.f32 %f1744, %f1743; fma.rn.f32 %f1745, 
%f1738, %f1744, %f1730; mov.f32 %f1746, 0f3FC90FDB; sub.f32 %f1747, %f1746, %f1745; setp.gt.f32 %p182, %f132, %f131; selp.f32 %f1748, %f1747, %f1745, %p182; mov.f32 %f1749, 0f40490FDB; sub.f32 %f1750, %f1749, %f1748; selp.f32 %f1751, %f1750, %f1748, %p181; mov.b32 %r587, %f1751; or.b32 %r588, %r57, %r587; mov.b32 %f1752, %r588; add.f32 %f1753, %f131, %f132; setp.le.f32 %p183, %f1753, 0f7F800000; selp.f32 %f4828, %f1752, %f1753, %p183; $L__BB0_126: sub.f32 %f1754, %f4828, %f4827; mul.f32 %f137, %f1754, 0f3F000000; add.f32 %f1755, %f4827, %f4828; mul.f32 %f138, %f1755, 0f3F000000; mul.f32 %f1756, %f137, 0f3F22F983; cvt.rni.s32.f32 %r1649, %f1756; cvt.rn.f32.s32 %f1757, %r1649; mov.f32 %f1758, 0fBFC90FDA; fma.rn.f32 %f1759, %f1757, %f1758, %f137; mov.f32 %f1760, 0fB3A22168; fma.rn.f32 %f1761, %f1757, %f1760, %f1759; mov.f32 %f1762, 0fA7C234C5; fma.rn.f32 %f4829, %f1757, %f1762, %f1761; abs.f32 %f140, %f137; setp.ltu.f32 %p185, %f140, 0f47CE4780; @%p185 bra $L__BB0_134; setp.eq.f32 %p186, %f140, 0f7F800000; @%p186 bra $L__BB0_133; bra.uni $L__BB0_128; $L__BB0_133: mov.f32 %f1765, 0f00000000; mul.rn.f32 %f4829, %f137, %f1765; mov.u32 %r1649, 0; bra.uni $L__BB0_134; $L__BB0_96: setp.lt.s32 %p140, %r14, 0; min.f32 %f1541, %f99, %f98; max.f32 %f1542, %f99, %f98; div.rn.f32 %f1543, %f1541, %f1542; mul.rn.f32 %f1544, %f1543, %f1543; mov.f32 %f1545, 0fC0B59883; mov.f32 %f1546, 0fBF52C7EA; fma.rn.f32 %f1547, %f1544, %f1546, %f1545; mov.f32 %f1548, 0fC0D21907; fma.rn.f32 %f1549, %f1547, %f1544, %f1548; mul.f32 %f1550, %f1544, %f1549; mul.f32 %f1551, %f1543, %f1550; add.f32 %f1552, %f1544, 0f41355DC0; mov.f32 %f1553, 0f41E6BD60; fma.rn.f32 %f1554, %f1552, %f1544, %f1553; mov.f32 %f1555, 0f419D92C8; fma.rn.f32 %f1556, %f1554, %f1544, %f1555; rcp.rn.f32 %f1557, %f1556; fma.rn.f32 %f1558, %f1551, %f1557, %f1543; mov.f32 %f1559, 0f3FC90FDB; sub.f32 %f1560, %f1559, %f1558; setp.gt.f32 %p141, %f99, %f98; selp.f32 %f1561, %f1560, %f1558, %p141; mov.f32 %f1562, 0f40490FDB; sub.f32 
%f1563, %f1562, %f1561; selp.f32 %f1564, %f1563, %f1561, %p140; mov.b32 %r466, %f1564; or.b32 %r467, %r15, %r466; mov.b32 %f1565, %r467; add.f32 %f1566, %f98, %f99; setp.le.f32 %p142, %f1566, 0f7F800000; selp.f32 %f4824, %f1565, %f1566, %p142; $L__BB0_99: sub.f32 %f1567, %f4824, %f4823; mul.f32 %f104, %f1567, 0f3F000000; add.f32 %f1568, %f4823, %f4824; mul.f32 %f105, %f1568, 0f3F000000; mul.f32 %f1569, %f104, 0f3F22F983; cvt.rni.s32.f32 %r1639, %f1569; cvt.rn.f32.s32 %f1570, %r1639; mov.f32 %f1571, 0fBFC90FDA; fma.rn.f32 %f1572, %f1570, %f1571, %f104; mov.f32 %f1573, 0fB3A22168; fma.rn.f32 %f1574, %f1570, %f1573, %f1572; mov.f32 %f1575, 0fA7C234C5; fma.rn.f32 %f4825, %f1570, %f1575, %f1574; abs.f32 %f107, %f104; setp.ltu.f32 %p144, %f107, 0f47CE4780; @%p144 bra $L__BB0_107; setp.eq.f32 %p145, %f107, 0f7F800000; @%p145 bra $L__BB0_106; bra.uni $L__BB0_101; $L__BB0_106: mov.f32 %f1578, 0f00000000; mul.rn.f32 %f4825, %f104, %f1578; mov.u32 %r1639, 0; bra.uni $L__BB0_107; $L__BB0_241: setp.eq.f32 %p346, %f295, 0f7F800000; setp.eq.f32 %p347, %f296, 0f7F800000; and.pred %p348, %p346, %p347; @%p348 bra $L__BB0_243; bra.uni $L__BB0_242; $L__BB0_243: setp.lt.s32 %p352, %r138, 0; selp.b32 %r938, 1075235812, 1061752795, %p352; or.b32 %r939, %r938, %r139; mov.b32 %f4852, %r939; bra.uni $L__BB0_245; $L__BB0_172: mov.b32 %r101, %f195; shr.u32 %r736, %r101, 23; and.b32 %r737, %r736, 255; add.s32 %r102, %r737, -128; shl.b32 %r738, %r101, 8; or.b32 %r103, %r738, -2147483648; shr.u32 %r104, %r102, 5; mov.u32 %r1655, 0; mov.u64 %rd1153, __cudart_i2opi_f; mov.u64 %rd1154, %rd5; mov.u32 %r1656, %r1655; $L__BB0_173: .pragma "nounroll"; mov.u32 %r106, %r1656; ld.global.nc.u32 %r741, [%rd1153]; // begin inline asm { mad.lo.cc.u32 %r739, %r741, %r103, %r106; madc.hi.u32 %r1656, %r741, %r103, 0; } // end inline asm st.local.u32 [%rd1154], %r739; add.s64 %rd1154, %rd1154, 4; add.s64 %rd1153, %rd1153, 4; add.s32 %r1655, %r1655, 1; setp.ne.s32 %p250, %r1655, 6; @%p250 bra $L__BB0_173; mov.u32 
%r746, -1560706194; // begin inline asm { mad.lo.cc.u32 %r744, %r746, %r103, %r106; madc.hi.u32 %r745, %r746, %r103, 0; } // end inline asm st.local.u32 [%rd111], %r745; mov.u32 %r749, 4; sub.s32 %r109, %r749, %r104; mov.u32 %r750, 6; sub.s32 %r751, %r750, %r104; mul.wide.s32 %rd648, %r751, 4; add.s64 %rd649, %rd5, %rd648; ld.local.u32 %r1657, [%rd649]; ld.local.u32 %r1658, [%rd649+-4]; and.b32 %r112, %r102, 31; setp.eq.s32 %p251, %r112, 0; @%p251 bra $L__BB0_176; mov.u32 %r752, 32; sub.s32 %r753, %r752, %r112; shr.u32 %r754, %r1658, %r753; shl.b32 %r755, %r1657, %r112; add.s32 %r1657, %r754, %r755; mul.wide.s32 %rd650, %r109, 4; add.s64 %rd651, %rd5, %rd650; ld.local.u32 %r756, [%rd651]; shr.u32 %r757, %r756, %r753; shl.b32 %r758, %r1658, %r112; add.s32 %r1658, %r757, %r758; $L__BB0_176: and.b32 %r759, %r101, -2147483648; shr.u32 %r760, %r1658, 30; shl.b32 %r761, %r1657, 2; or.b32 %r762, %r760, %r761; shr.u32 %r763, %r762, 31; shr.u32 %r764, %r1657, 30; add.s32 %r765, %r763, %r764; neg.s32 %r766, %r765; setp.eq.s32 %p252, %r759, 0; selp.b32 %r1659, %r765, %r766, %p252; setp.ne.s32 %p253, %r763, 0; xor.b32 %r767, %r759, -2147483648; selp.b32 %r768, %r767, %r759, %p253; selp.b32 %r769, -1, 0, %p253; xor.b32 %r770, %r762, %r769; shl.b32 %r771, %r1658, 2; xor.b32 %r772, %r771, %r769; cvt.u64.u32 %rd652, %r770; cvt.u64.u32 %rd653, %r772; bfi.b64 %rd654, %rd652, %rd653, 32, 32; cvt.rn.f64.s64 %fd9, %rd654; mul.f64 %fd10, %fd9, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f2020, %fd10; setp.eq.s32 %p254, %r768, 0; neg.f32 %f2021, %f2020; selp.f32 %f4837, %f2020, %f2021, %p254; $L__BB0_178: mul.f32 %f2023, %f196, 0f3F22F983; cvt.rni.s32.f32 %r1664, %f2023; cvt.rn.f32.s32 %f2024, %r1664; fma.rn.f32 %f2026, %f2024, %f2015, %f196; fma.rn.f32 %f2028, %f2024, %f2017, %f2026; fma.rn.f32 %f4838, %f2024, %f2019, %f2028; abs.f32 %f203, %f196; setp.ltu.f32 %p255, %f203, 0f47CE4780; @%p255 bra $L__BB0_186; setp.eq.f32 %p256, %f203, 0f7F800000; @%p256 bra $L__BB0_185; bra.uni $L__BB0_180; 
$L__BB0_185: mov.f32 %f2032, 0f00000000; mul.rn.f32 %f4838, %f196, %f2032; mov.u32 %r1664, 0; bra.uni $L__BB0_186; $L__BB0_180: mov.b32 %r120, %f196; shr.u32 %r776, %r120, 23; and.b32 %r777, %r776, 255; add.s32 %r121, %r777, -128; shl.b32 %r778, %r120, 8; or.b32 %r122, %r778, -2147483648; shr.u32 %r123, %r121, 5; mov.u32 %r1660, 0; mov.u64 %rd1155, __cudart_i2opi_f; mov.u64 %rd1156, %rd5; mov.u32 %r1661, %r1660; $L__BB0_181: .pragma "nounroll"; mov.u32 %r125, %r1661; ld.global.nc.u32 %r781, [%rd1155]; // begin inline asm { mad.lo.cc.u32 %r779, %r781, %r122, %r125; madc.hi.u32 %r1661, %r781, %r122, 0; } // end inline asm st.local.u32 [%rd1156], %r779; add.s64 %rd1156, %rd1156, 4; add.s64 %rd1155, %rd1155, 4; add.s32 %r1660, %r1660, 1; setp.ne.s32 %p257, %r1660, 6; @%p257 bra $L__BB0_181; mov.u32 %r786, -1560706194; // begin inline asm { mad.lo.cc.u32 %r784, %r786, %r122, %r125; madc.hi.u32 %r785, %r786, %r122, 0; } // end inline asm st.local.u32 [%rd111], %r785; mov.u32 %r789, 4; sub.s32 %r128, %r789, %r123; mov.u32 %r790, 6; sub.s32 %r791, %r790, %r123; mul.wide.s32 %rd656, %r791, 4; add.s64 %rd657, %rd5, %rd656; ld.local.u32 %r1662, [%rd657]; ld.local.u32 %r1663, [%rd657+-4]; and.b32 %r131, %r121, 31; setp.eq.s32 %p258, %r131, 0; @%p258 bra $L__BB0_184; mov.u32 %r792, 32; sub.s32 %r793, %r792, %r131; shr.u32 %r794, %r1663, %r793; shl.b32 %r795, %r1662, %r131; add.s32 %r1662, %r794, %r795; mul.wide.s32 %rd658, %r128, 4; add.s64 %rd659, %rd5, %rd658; ld.local.u32 %r796, [%rd659]; shr.u32 %r797, %r796, %r793; shl.b32 %r798, %r1663, %r131; add.s32 %r1663, %r797, %r798; $L__BB0_184: and.b32 %r799, %r120, -2147483648; shr.u32 %r800, %r1663, 30; shl.b32 %r801, %r1662, 2; or.b32 %r802, %r800, %r801; shr.u32 %r803, %r802, 31; shr.u32 %r804, %r1662, 30; add.s32 %r805, %r803, %r804; neg.s32 %r806, %r805; setp.eq.s32 %p259, %r799, 0; selp.b32 %r1664, %r805, %r806, %p259; setp.ne.s32 %p260, %r803, 0; xor.b32 %r807, %r799, -2147483648; selp.b32 %r808, %r807, %r799, %p260; 
selp.b32 %r809, -1, 0, %p260; xor.b32 %r810, %r802, %r809; shl.b32 %r811, %r1663, 2; xor.b32 %r812, %r811, %r809; cvt.u64.u32 %rd660, %r810; cvt.u64.u32 %rd661, %r812; bfi.b64 %rd662, %rd660, %rd661, 32, 32; cvt.rn.f64.s64 %fd11, %rd662; mul.f64 %fd12, %fd11, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f2030, %fd12; setp.eq.s32 %p261, %r808, 0; neg.f32 %f2031, %f2030; selp.f32 %f4838, %f2030, %f2031, %p261; $L__BB0_186: mul.f32 %f2033, %f4837, %f4837; mov.f32 %f2034, 0fBAB607ED; mov.f32 %f2035, 0f37CBAC00; fma.rn.f32 %f2036, %f2035, %f2033, %f2034; mov.f32 %f2037, 0f3D2AAABB; fma.rn.f32 %f2038, %f2036, %f2033, %f2037; mov.f32 %f2039, 0fBEFFFFFF; fma.rn.f32 %f2040, %f2038, %f2033, %f2039; mov.f32 %f2041, 0f3F800000; fma.rn.f32 %f2042, %f2040, %f2033, %f2041; mov.f32 %f2043, 0f3C0885E4; mov.f32 %f2044, 0fB94D4153; fma.rn.f32 %f2045, %f2044, %f2033, %f2043; mov.f32 %f2046, 0fBE2AAAA8; fma.rn.f32 %f2047, %f2045, %f2033, %f2046; mov.f32 %f2048, 0f00000000; fma.rn.f32 %f2049, %f2033, %f4837, %f2048; fma.rn.f32 %f2050, %f2047, %f2049, %f4837; and.b32 %r814, %r1659, 1; setp.eq.b32 %p262, %r814, 1; selp.f32 %f2051, %f2042, %f2050, %p262; selp.f32 %f2052, %f2050, %f2042, %p262; neg.f32 %f2053, %f2051; and.b32 %r815, %r1659, 2; setp.eq.s32 %p263, %r815, 0; selp.f32 %f2054, %f2051, %f2053, %p263; neg.f32 %f2055, %f2052; add.s32 %r816, %r1659, 1; and.b32 %r817, %r816, 2; setp.eq.s32 %p264, %r817, 0; selp.f32 %f2056, %f2052, %f2055, %p264; mul.f32 %f2057, %f4838, %f4838; fma.rn.f32 %f2058, %f2035, %f2057, %f2034; fma.rn.f32 %f2059, %f2058, %f2057, %f2037; fma.rn.f32 %f2060, %f2059, %f2057, %f2039; fma.rn.f32 %f2061, %f2060, %f2057, %f2041; fma.rn.f32 %f2062, %f2057, %f4838, %f2048; fma.rn.f32 %f2063, %f2044, %f2057, %f2043; fma.rn.f32 %f2064, %f2063, %f2057, %f2046; fma.rn.f32 %f2065, %f2064, %f2062, %f4838; and.b32 %r818, %r1664, 1; setp.eq.b32 %p265, %r818, 1; selp.f32 %f2066, %f2061, %f2065, %p265; selp.f32 %f2067, %f2065, %f2061, %p265; and.b32 %r819, %r1664, 2; setp.eq.s32 %p266, 
%r819, 0; neg.f32 %f2068, %f2066; selp.f32 %f2069, %f2066, %f2068, %p266; add.s32 %r820, %r1664, 1; and.b32 %r821, %r820, 2; setp.eq.s32 %p267, %r821, 0; neg.f32 %f2070, %f2067; selp.f32 %f2071, %f2067, %f2070, %p267; mov.b32 %r822, %f2071; neg.f32 %f2072, %f2069; mov.b32 %r823, %f2069; cvt.u64.u32 %rd663, %r823; cvt.u64.u32 %rd664, %r822; bfi.b64 %rd155, %rd663, %rd664, 32, 32; mov.b32 %r824, %f2072; cvt.u64.u32 %rd665, %r824; bfi.b64 %rd156, %rd664, %rd665, 32, 32; mul.f32 %f2073, %f181, %f2054; mov.b32 %r825, %f2073; cvt.u64.u32 %rd666, %r825; mov.b32 %r826, %f2056; cvt.u64.u32 %rd667, %r826; bfi.b64 %rd157, %rd666, %rd667, 32, 32; neg.f32 %f2074, %f2054; mov.b32 %r827, %f2074; mul.f32 %f2075, %f181, %f2056; mov.b32 %r828, %f2075; cvt.u64.u32 %rd668, %r828; cvt.u64.u32 %rd669, %r827; bfi.b64 %rd158, %rd668, %rd669, 32, 32; mul.f32 %f207, %f180, %f180; add.f32 %f2076, %f207, 0f00000000; mul.f32 %f208, %f182, %f182; add.f32 %f209, %f2076, %f208; ld.global.f32 %f210, [%rd81+44]; neg.f32 %f2077, %f4851; max.f32 %f2078, %f2077, %f2048; mul.f32 %f211, %f175, %f2078; abs.f32 %f212, %f211; setp.ltu.f32 %p268, %f212, 0f3F800000; @%p268 bra $L__BB0_188; bra.uni $L__BB0_187; $L__BB0_188: mul.f32 %f2100, %f211, %f211; mov.f32 %f2101, 0f394FFF49; mov.f32 %f2102, 0f363D0ADA; fma.rn.f32 %f2103, %f2102, %f2100, %f2101; mov.f32 %f2104, 0f3C08889A; fma.rn.f32 %f2105, %f2103, %f2100, %f2104; mov.f32 %f2106, 0f3E2AAAAB; fma.rn.f32 %f2107, %f2105, %f2100, %f2106; mul.f32 %f2108, %f2100, %f2107; fma.rn.f32 %f4839, %f2108, %f211, %f211; bra.uni $L__BB0_189; $L__BB0_187: mov.f32 %f2079, 0f3FB8AA3B; mul.rn.f32 %f2080, %f212, %f2079; cvt.rzi.f32.f32 %f2081, %f2080; abs.f32 %f2082, %f2081; setp.gt.f32 %p269, %f2082, 0f42FC0000; mov.b32 %r829, %f2081; and.b32 %r830, %r829, -2147483648; or.b32 %r831, %r830, 1123811328; mov.b32 %f2083, %r831; selp.f32 %f2084, %f2083, %f2081, %p269; mov.f32 %f2085, 0fBF317218; fma.rn.f32 %f2086, %f2084, %f2085, %f212; mov.f32 %f2087, 0f3102E308; fma.rn.f32 
%f2088, %f2084, %f2087, %f2086; mul.f32 %f2089, %f2088, 0f3FB8AA3B; add.f32 %f2090, %f2084, 0f4B40007D; mov.b32 %r832, %f2090; shl.b32 %r833, %r832, 23; mov.b32 %f2091, %r833; ex2.approx.ftz.f32 %f2092, %f2089; mul.f32 %f2093, %f2092, %f2091; mov.f32 %f2094, 0f3E000000; div.approx.f32 %f2095, %f2094, %f2093; neg.f32 %f2096, %f2095; mov.f32 %f2097, 0f40000000; fma.rn.f32 %f2098, %f2097, %f2093, %f2096; setp.ge.f32 %p270, %f212, 0f42B40000; selp.f32 %f2099, 0f7F800000, %f2098, %p270; mov.b32 %r834, %f2099; mov.b32 %r835, %f211; and.b32 %r836, %r835, -2147483648; or.b32 %r837, %r836, %r834; mov.b32 %f4839, %r837; $L__BB0_189: add.f32 %f2112, %f4839, 0f3727C5AC; mul.f32 %f216, %f210, %f2112; ld.global.f32 %f217, [%rd81+40]; mov.f32 %f2113, 0fBF000000; cvt.rzi.f32.f32 %f2114, %f2113; add.f32 %f2115, %f2114, %f2114; mov.f32 %f2116, 0fBF800000; sub.f32 %f2117, %f2116, %f2115; abs.f32 %f218, %f2117; mul.f32 %f219, %f180, %f182; abs.f32 %f220, %f219; setp.lt.f32 %p271, %f220, 0f00800000; mul.f32 %f2118, %f220, 0f4B800000; selp.f32 %f2119, %f2118, %f220, %p271; selp.f32 %f2120, 0fC1C00000, 0f00000000, %p271; mov.b32 %r838, %f2119; add.s32 %r839, %r838, -1060439283; and.b32 %r840, %r839, -8388608; sub.s32 %r841, %r838, %r840; mov.b32 %f2121, %r841; cvt.rn.f32.s32 %f2122, %r840; mov.f32 %f2123, 0f34000000; fma.rn.f32 %f2124, %f2122, %f2123, %f2120; add.f32 %f2125, %f2121, 0fBF800000; add.f32 %f2110, %f2121, 0f3F800000; mov.f32 %f4840, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f2109,%f2110; // end inline asm add.f32 %f2126, %f2125, %f2125; mul.f32 %f2127, %f2109, %f2126; mul.f32 %f2128, %f2127, %f2127; sub.f32 %f2129, %f2125, %f2127; add.f32 %f2130, %f2129, %f2129; neg.f32 %f2131, %f2127; fma.rn.f32 %f2132, %f2131, %f2125, %f2130; mul.rn.f32 %f2133, %f2109, %f2132; mov.f32 %f2134, 0f3B52E7DB; mov.f32 %f2135, 0f3A2C32E4; fma.rn.f32 %f2136, %f2135, %f2128, %f2134; mov.f32 %f2137, 0f3C93BB73; fma.rn.f32 %f2138, %f2136, %f2128, %f2137; mov.f32 %f2139, 0f3DF6384F; 
fma.rn.f32 %f2140, %f2138, %f2128, %f2139; mul.rn.f32 %f2141, %f2140, %f2128; mov.f32 %f2142, 0f3FB8AA3B; fma.rn.f32 %f2143, %f2127, %f2142, %f2124; sub.f32 %f2144, %f2124, %f2143; fma.rn.f32 %f2145, %f2127, %f2142, %f2144; fma.rn.f32 %f2146, %f2133, %f2142, %f2145; mov.f32 %f2147, 0f32A55E34; fma.rn.f32 %f2148, %f2127, %f2147, %f2146; mul.f32 %f2149, %f2141, 0f40400000; fma.rn.f32 %f2150, %f2149, %f2133, %f2148; fma.rn.f32 %f2151, %f2141, %f2127, %f2150; add.rn.f32 %f221, %f2143, %f2151; neg.f32 %f2152, %f2143; add.rn.f32 %f2153, %f221, %f2152; neg.f32 %f2154, %f2153; add.rn.f32 %f222, %f2151, %f2154; mul.rn.f32 %f2155, %f221, %f2116; neg.f32 %f2156, %f2155; fma.rn.f32 %f2157, %f221, %f2116, %f2156; fma.rn.f32 %f2158, %f222, %f2116, %f2157; cvt.rni.f32.f32 %f2159, %f2155; sub.f32 %f2160, %f2155, %f2159; add.f32 %f2161, %f2158, %f2160; mov.f32 %f2162, 0f3AAF85ED; mov.f32 %f2163, 0f391FCB8E; fma.rn.f32 %f2164, %f2163, %f2161, %f2162; mov.f32 %f2165, 0f3C1D9856; fma.rn.f32 %f2166, %f2164, %f2161, %f2165; mov.f32 %f2167, 0f3D6357BB; fma.rn.f32 %f2168, %f2166, %f2161, %f2167; mov.f32 %f2169, 0f3E75FDEC; fma.rn.f32 %f2170, %f2168, %f2161, %f2169; mov.f32 %f2171, 0f3F317218; fma.rn.f32 %f2172, %f2170, %f2161, %f2171; fma.rn.f32 %f2173, %f2172, %f2161, %f4840; cvt.rzi.s32.f32 %r842, %f2159; setp.gt.f32 %p272, %f2159, 0f00000000; selp.b32 %r843, 0, -2097152000, %p272; add.s32 %r844, %r843, 2130706432; mov.b32 %f2174, %r844; mul.f32 %f2175, %f2173, %f2174; shl.b32 %r845, %r842, 23; sub.s32 %r846, %r845, %r843; mov.b32 %f2176, %r846; mul.f32 %f2177, %f2175, %f2176; abs.f32 %f2178, %f2155; setp.gt.f32 %p273, %f2178, 0f43180000; setp.lt.f32 %p274, %f2155, 0f00000000; selp.f32 %f2179, 0f00000000, 0f7F800000, %p274; selp.f32 %f223, %f2179, %f2177, %p273; setp.eq.f32 %p275, %f219, 0f3F800000; @%p275 bra $L__BB0_196; abs.f32 %f4776, %f219; setp.gtu.f32 %p276, %f4776, 0f7F800000; @%p276 bra $L__BB0_195; bra.uni $L__BB0_191; $L__BB0_195: mov.f32 %f2182, 0fBF800000; add.rn.f32 
%f4840, %f219, %f2182; bra.uni $L__BB0_196; $L__BB0_191: abs.f32 %f4777, %f219; setp.eq.f32 %p277, %f219, 0f00000000; setp.eq.f32 %p278, %f4777, 0f7F800000; or.pred %p279, %p277, %p278; @%p279 bra $L__BB0_194; bra.uni $L__BB0_192; $L__BB0_194: setp.eq.f32 %p282, %f218, 0f3F800000; add.f32 %f2181, %f219, %f219; mov.b32 %r847, %f2181; xor.b32 %r848, %r847, 2139095040; and.b32 %r849, %r848, 2147483647; selp.b32 %r850, %r848, %r849, %p282; mov.b32 %f4840, %r850; bra.uni $L__BB0_196; $L__BB0_128: mov.b32 %r59, %f137; shr.u32 %r596, %r59, 23; and.b32 %r597, %r596, 255; add.s32 %r60, %r597, -128; shl.b32 %r598, %r59, 8; or.b32 %r61, %r598, -2147483648; shr.u32 %r62, %r60, 5; mov.u32 %r1645, 0; mov.u64 %rd1143, __cudart_i2opi_f; mov.u64 %rd1144, %rd5; mov.u32 %r1646, %r1645; $L__BB0_129: .pragma "nounroll"; mov.u32 %r64, %r1646; ld.global.nc.u32 %r601, [%rd1143]; // begin inline asm { mad.lo.cc.u32 %r599, %r601, %r61, %r64; madc.hi.u32 %r1646, %r601, %r61, 0; } // end inline asm st.local.u32 [%rd1144], %r599; add.s64 %rd1144, %rd1144, 4; add.s64 %rd1143, %rd1143, 4; add.s32 %r1645, %r1645, 1; setp.ne.s32 %p187, %r1645, 6; @%p187 bra $L__BB0_129; mov.u32 %r606, -1560706194; // begin inline asm { mad.lo.cc.u32 %r604, %r606, %r61, %r64; madc.hi.u32 %r605, %r606, %r61, 0; } // end inline asm st.local.u32 [%rd111], %r605; mov.u32 %r609, 4; sub.s32 %r67, %r609, %r62; mov.u32 %r610, 6; sub.s32 %r611, %r610, %r62; mul.wide.s32 %rd606, %r611, 4; add.s64 %rd607, %rd5, %rd606; ld.local.u32 %r1647, [%rd607]; ld.local.u32 %r1648, [%rd607+-4]; and.b32 %r70, %r60, 31; setp.eq.s32 %p188, %r70, 0; @%p188 bra $L__BB0_132; mov.u32 %r612, 32; sub.s32 %r613, %r612, %r70; shr.u32 %r614, %r1648, %r613; shl.b32 %r615, %r1647, %r70; add.s32 %r1647, %r614, %r615; mul.wide.s32 %rd608, %r67, 4; add.s64 %rd609, %rd5, %rd608; ld.local.u32 %r616, [%rd609]; shr.u32 %r617, %r616, %r613; shl.b32 %r618, %r1648, %r70; add.s32 %r1648, %r617, %r618; $L__BB0_132: and.b32 %r619, %r59, -2147483648; shr.u32 %r620, 
%r1648, 30; shl.b32 %r621, %r1647, 2; or.b32 %r622, %r620, %r621; shr.u32 %r623, %r622, 31; shr.u32 %r624, %r1647, 30; add.s32 %r625, %r623, %r624; neg.s32 %r626, %r625; setp.eq.s32 %p189, %r619, 0; selp.b32 %r1649, %r625, %r626, %p189; setp.ne.s32 %p190, %r623, 0; xor.b32 %r627, %r619, -2147483648; selp.b32 %r628, %r627, %r619, %p190; selp.b32 %r629, -1, 0, %p190; xor.b32 %r630, %r622, %r629; shl.b32 %r631, %r1648, 2; xor.b32 %r632, %r631, %r629; cvt.u64.u32 %rd610, %r630; cvt.u64.u32 %rd611, %r632; bfi.b64 %rd612, %rd610, %rd611, 32, 32; cvt.rn.f64.s64 %fd5, %rd612; mul.f64 %fd6, %fd5, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f1763, %fd6; setp.eq.s32 %p191, %r628, 0; neg.f32 %f1764, %f1763; selp.f32 %f4829, %f1763, %f1764, %p191; $L__BB0_134: mul.f32 %f1766, %f138, 0f3F22F983; cvt.rni.s32.f32 %r1654, %f1766; cvt.rn.f32.s32 %f1767, %r1654; fma.rn.f32 %f1769, %f1767, %f1758, %f138; fma.rn.f32 %f1771, %f1767, %f1760, %f1769; fma.rn.f32 %f4830, %f1767, %f1762, %f1771; abs.f32 %f145, %f138; setp.ltu.f32 %p192, %f145, 0f47CE4780; @%p192 bra $L__BB0_142; setp.eq.f32 %p193, %f145, 0f7F800000; @%p193 bra $L__BB0_141; bra.uni $L__BB0_136; $L__BB0_141: mov.f32 %f1775, 0f00000000; mul.rn.f32 %f4830, %f138, %f1775; mov.u32 %r1654, 0; bra.uni $L__BB0_142; $L__BB0_101: mov.b32 %r17, %f104; shr.u32 %r475, %r17, 23; and.b32 %r476, %r475, 255; add.s32 %r18, %r476, -128; shl.b32 %r477, %r17, 8; or.b32 %r19, %r477, -2147483648; shr.u32 %r20, %r18, 5; mov.u32 %r1635, 0; mov.u64 %rd1139, __cudart_i2opi_f; mov.u64 %rd1140, %rd5; mov.u32 %r1636, %r1635; $L__BB0_102: .pragma "nounroll"; mov.u32 %r22, %r1636; ld.global.nc.u32 %r480, [%rd1139]; // begin inline asm { mad.lo.cc.u32 %r478, %r480, %r19, %r22; madc.hi.u32 %r1636, %r480, %r19, 0; } // end inline asm st.local.u32 [%rd1140], %r478; add.s64 %rd1140, %rd1140, 4; add.s64 %rd1139, %rd1139, 4; add.s32 %r1635, %r1635, 1; setp.ne.s32 %p146, %r1635, 6; @%p146 bra $L__BB0_102; mov.u32 %r485, -1560706194; // begin inline asm { mad.lo.cc.u32 
%r483, %r485, %r19, %r22; madc.hi.u32 %r484, %r485, %r19, 0; } // end inline asm st.local.u32 [%rd111], %r484; mov.u32 %r488, 4; sub.s32 %r25, %r488, %r20; mov.u32 %r489, 6; sub.s32 %r490, %r489, %r20; mul.wide.s32 %rd576, %r490, 4; add.s64 %rd577, %rd5, %rd576; ld.local.u32 %r1637, [%rd577]; ld.local.u32 %r1638, [%rd577+-4]; and.b32 %r28, %r18, 31; setp.eq.s32 %p147, %r28, 0; @%p147 bra $L__BB0_105; mov.u32 %r491, 32; sub.s32 %r492, %r491, %r28; shr.u32 %r493, %r1638, %r492; shl.b32 %r494, %r1637, %r28; add.s32 %r1637, %r493, %r494; mul.wide.s32 %rd578, %r25, 4; add.s64 %rd579, %rd5, %rd578; ld.local.u32 %r495, [%rd579]; shr.u32 %r496, %r495, %r492; shl.b32 %r497, %r1638, %r28; add.s32 %r1638, %r496, %r497; $L__BB0_105: and.b32 %r498, %r17, -2147483648; shr.u32 %r499, %r1638, 30; shl.b32 %r500, %r1637, 2; or.b32 %r501, %r499, %r500; shr.u32 %r502, %r501, 31; shr.u32 %r503, %r1637, 30; add.s32 %r504, %r502, %r503; neg.s32 %r505, %r504; setp.eq.s32 %p148, %r498, 0; selp.b32 %r1639, %r504, %r505, %p148; setp.ne.s32 %p149, %r502, 0; xor.b32 %r506, %r498, -2147483648; selp.b32 %r507, %r506, %r498, %p149; selp.b32 %r508, -1, 0, %p149; xor.b32 %r509, %r501, %r508; shl.b32 %r510, %r1638, 2; xor.b32 %r511, %r510, %r508; cvt.u64.u32 %rd580, %r509; cvt.u64.u32 %rd581, %r511; bfi.b64 %rd582, %rd580, %rd581, 32, 32; cvt.rn.f64.s64 %fd1, %rd582; mul.f64 %fd2, %fd1, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f1576, %fd2; setp.eq.s32 %p150, %r507, 0; neg.f32 %f1577, %f1576; selp.f32 %f4825, %f1576, %f1577, %p150; $L__BB0_107: mul.f32 %f1579, %f105, 0f3F22F983; cvt.rni.s32.f32 %r1644, %f1579; cvt.rn.f32.s32 %f1580, %r1644; fma.rn.f32 %f1582, %f1580, %f1571, %f105; fma.rn.f32 %f1584, %f1580, %f1573, %f1582; fma.rn.f32 %f4826, %f1580, %f1575, %f1584; abs.f32 %f112, %f105; setp.ltu.f32 %p151, %f112, 0f47CE4780; @%p151 bra $L__BB0_115; setp.eq.f32 %p152, %f112, 0f7F800000; @%p152 bra $L__BB0_114; bra.uni $L__BB0_109; $L__BB0_114: mov.f32 %f1588, 0f00000000; mul.rn.f32 %f4826, %f105, %f1588; 
mov.u32 %r1644, 0; bra.uni $L__BB0_115; $L__BB0_136: mov.b32 %r78, %f138; shr.u32 %r636, %r78, 23; and.b32 %r637, %r636, 255; add.s32 %r79, %r637, -128; shl.b32 %r638, %r78, 8; or.b32 %r80, %r638, -2147483648; shr.u32 %r81, %r79, 5; mov.u32 %r1650, 0; mov.u64 %rd1145, __cudart_i2opi_f; mov.u64 %rd1146, %rd5; mov.u32 %r1651, %r1650; $L__BB0_137: .pragma "nounroll"; mov.u32 %r83, %r1651; ld.global.nc.u32 %r641, [%rd1145]; // begin inline asm { mad.lo.cc.u32 %r639, %r641, %r80, %r83; madc.hi.u32 %r1651, %r641, %r80, 0; } // end inline asm st.local.u32 [%rd1146], %r639; add.s64 %rd1146, %rd1146, 4; add.s64 %rd1145, %rd1145, 4; add.s32 %r1650, %r1650, 1; setp.ne.s32 %p194, %r1650, 6; @%p194 bra $L__BB0_137; mov.u32 %r646, -1560706194; // begin inline asm { mad.lo.cc.u32 %r644, %r646, %r80, %r83; madc.hi.u32 %r645, %r646, %r80, 0; } // end inline asm st.local.u32 [%rd111], %r645; mov.u32 %r649, 4; sub.s32 %r86, %r649, %r81; mov.u32 %r650, 6; sub.s32 %r651, %r650, %r81; mul.wide.s32 %rd614, %r651, 4; add.s64 %rd615, %rd5, %rd614; ld.local.u32 %r1652, [%rd615]; ld.local.u32 %r1653, [%rd615+-4]; and.b32 %r89, %r79, 31; setp.eq.s32 %p195, %r89, 0; @%p195 bra $L__BB0_140; mov.u32 %r652, 32; sub.s32 %r653, %r652, %r89; shr.u32 %r654, %r1653, %r653; shl.b32 %r655, %r1652, %r89; add.s32 %r1652, %r654, %r655; mul.wide.s32 %rd616, %r86, 4; add.s64 %rd617, %rd5, %rd616; ld.local.u32 %r656, [%rd617]; shr.u32 %r657, %r656, %r653; shl.b32 %r658, %r1653, %r89; add.s32 %r1653, %r657, %r658; $L__BB0_140: and.b32 %r659, %r78, -2147483648; shr.u32 %r660, %r1653, 30; shl.b32 %r661, %r1652, 2; or.b32 %r662, %r660, %r661; shr.u32 %r663, %r662, 31; shr.u32 %r664, %r1652, 30; add.s32 %r665, %r663, %r664; neg.s32 %r666, %r665; setp.eq.s32 %p196, %r659, 0; selp.b32 %r1654, %r665, %r666, %p196; setp.ne.s32 %p197, %r663, 0; xor.b32 %r667, %r659, -2147483648; selp.b32 %r668, %r667, %r659, %p197; selp.b32 %r669, -1, 0, %p197; xor.b32 %r670, %r662, %r669; shl.b32 %r671, %r1653, 2; xor.b32 %r672, 
%r671, %r669; cvt.u64.u32 %rd618, %r670; cvt.u64.u32 %rd619, %r672; bfi.b64 %rd620, %rd618, %rd619, 32, 32; cvt.rn.f64.s64 %fd7, %rd620; mul.f64 %fd8, %fd7, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f1773, %fd8; setp.eq.s32 %p198, %r668, 0; neg.f32 %f1774, %f1773; selp.f32 %f4830, %f1773, %f1774, %p198; $L__BB0_142: setp.lt.f32 %p199, %f124, 0f00000000; mov.f32 %f1776, 0f00000000; selp.f32 %f1777, 0fBF800000, 0f3F800000, %p199; mov.f32 %f1778, 0f3F800000; mul.f32 %f149, %f124, %f1777; mul.f32 %f1779, %f4829, %f4829; mov.f32 %f1780, 0fBAB607ED; mov.f32 %f1781, 0f37CBAC00; fma.rn.f32 %f1782, %f1781, %f1779, %f1780; mov.f32 %f1783, 0f3D2AAABB; fma.rn.f32 %f1784, %f1782, %f1779, %f1783; mov.f32 %f1785, 0fBEFFFFFF; fma.rn.f32 %f1786, %f1784, %f1779, %f1785; fma.rn.f32 %f1787, %f1786, %f1779, %f1778; mov.f32 %f1788, 0f3C0885E4; mov.f32 %f1789, 0fB94D4153; fma.rn.f32 %f1790, %f1789, %f1779, %f1788; mov.f32 %f1791, 0fBE2AAAA8; fma.rn.f32 %f1792, %f1790, %f1779, %f1791; fma.rn.f32 %f1793, %f1779, %f4829, %f1776; fma.rn.f32 %f1794, %f1792, %f1793, %f4829; and.b32 %r674, %r1649, 1; setp.eq.b32 %p200, %r674, 1; selp.f32 %f1795, %f1787, %f1794, %p200; selp.f32 %f1796, %f1794, %f1787, %p200; neg.f32 %f1797, %f1795; and.b32 %r675, %r1649, 2; setp.eq.s32 %p201, %r675, 0; selp.f32 %f1798, %f1795, %f1797, %p201; neg.f32 %f1799, %f1796; add.s32 %r676, %r1649, 1; and.b32 %r677, %r676, 2; setp.eq.s32 %p202, %r677, 0; selp.f32 %f1800, %f1796, %f1799, %p202; mul.f32 %f1801, %f4830, %f4830; fma.rn.f32 %f1802, %f1781, %f1801, %f1780; fma.rn.f32 %f1803, %f1802, %f1801, %f1783; fma.rn.f32 %f1804, %f1803, %f1801, %f1785; fma.rn.f32 %f1805, %f1804, %f1801, %f1778; fma.rn.f32 %f1806, %f1801, %f4830, %f1776; fma.rn.f32 %f1807, %f1789, %f1801, %f1788; fma.rn.f32 %f1808, %f1807, %f1801, %f1791; fma.rn.f32 %f1809, %f1808, %f1806, %f4830; and.b32 %r678, %r1654, 1; setp.eq.b32 %p203, %r678, 1; selp.f32 %f1810, %f1805, %f1809, %p203; selp.f32 %f1811, %f1809, %f1805, %p203; and.b32 %r679, %r1654, 2; 
setp.eq.s32 %p204, %r679, 0; neg.f32 %f1812, %f1810; selp.f32 %f1813, %f1810, %f1812, %p204; add.s32 %r680, %r1654, 1; and.b32 %r681, %r680, 2; setp.eq.s32 %p205, %r681, 0; neg.f32 %f1814, %f1811; selp.f32 %f1815, %f1811, %f1814, %p205; mov.b32 %r682, %f1815; neg.f32 %f1816, %f1813; mov.b32 %r683, %f1813; cvt.u64.u32 %rd621, %r683; cvt.u64.u32 %rd622, %r682; bfi.b64 %rd130, %rd621, %rd622, 32, 32; mov.b32 %r684, %f1816; cvt.u64.u32 %rd623, %r684; bfi.b64 %rd131, %rd622, %rd623, 32, 32; mul.f32 %f1817, %f1777, %f1798; mov.b32 %r685, %f1817; cvt.u64.u32 %rd624, %r685; mov.b32 %r686, %f1800; cvt.u64.u32 %rd625, %r686; bfi.b64 %rd132, %rd624, %rd625, 32, 32; neg.f32 %f1818, %f1798; mov.b32 %r687, %f1818; mul.f32 %f1819, %f1777, %f1800; mov.b32 %r688, %f1819; cvt.u64.u32 %rd626, %r688; cvt.u64.u32 %rd627, %r687; bfi.b64 %rd133, %rd626, %rd627, 32, 32; mul.f32 %f1820, %f123, 0f4B000000; setp.lt.f32 %p206, %f123, 0f00800000; selp.f32 %f150, %f1820, %f123, %p206; selp.f32 %f1821, 0fC1B80000, 0f00000000, %p206; mov.b32 %r689, %f150; add.s32 %r690, %r689, -1059760811; and.b32 %r691, %r690, -8388608; sub.s32 %r692, %r689, %r691; mov.b32 %f1822, %r692; cvt.rn.f32.s32 %f1823, %r691; mov.f32 %f1824, 0f34000000; fma.rn.f32 %f1825, %f1823, %f1824, %f1821; add.f32 %f1826, %f1822, 0fBF800000; mov.f32 %f1827, 0f3E1039F6; mov.f32 %f1828, 0fBE055027; fma.rn.f32 %f1829, %f1828, %f1826, %f1827; mov.f32 %f1830, 0fBDF8CDCC; fma.rn.f32 %f1831, %f1829, %f1826, %f1830; mov.f32 %f1832, 0f3E0F2955; fma.rn.f32 %f1833, %f1831, %f1826, %f1832; mov.f32 %f1834, 0fBE2AD8B9; fma.rn.f32 %f1835, %f1833, %f1826, %f1834; mov.f32 %f1836, 0f3E4CED0B; fma.rn.f32 %f1837, %f1835, %f1826, %f1836; mov.f32 %f1838, 0fBE7FFF22; fma.rn.f32 %f1839, %f1837, %f1826, %f1838; mov.f32 %f1840, 0f3EAAAA78; fma.rn.f32 %f1841, %f1839, %f1826, %f1840; mov.f32 %f1842, 0fBF000000; fma.rn.f32 %f1843, %f1841, %f1826, %f1842; mul.f32 %f1844, %f1826, %f1843; fma.rn.f32 %f1845, %f1844, %f1826, %f1826; mov.f32 %f1846, 0f3F317218; 
fma.rn.f32 %f4831, %f1825, %f1846, %f1845; setp.lt.u32 %p207, %r689, 2139095040; @%p207 bra $L__BB0_144; mov.f32 %f1847, 0f7F800000; fma.rn.f32 %f4831, %f150, %f1847, %f1847; $L__BB0_144: setp.eq.f32 %p208, %f150, 0f00000000; selp.f32 %f154, 0fFF800000, %f4831, %p208; mul.f32 %f1848, %f149, 0f4B000000; setp.lt.f32 %p209, %f149, 0f00800000; selp.f32 %f155, %f1848, %f149, %p209; selp.f32 %f1849, 0fC1B80000, 0f00000000, %p209; mov.b32 %r693, %f155; add.s32 %r694, %r693, -1059760811; and.b32 %r695, %r694, -8388608; sub.s32 %r696, %r693, %r695; mov.b32 %f1850, %r696; cvt.rn.f32.s32 %f1851, %r695; fma.rn.f32 %f1853, %f1851, %f1824, %f1849; add.f32 %f1854, %f1850, 0fBF800000; fma.rn.f32 %f1857, %f1828, %f1854, %f1827; fma.rn.f32 %f1859, %f1857, %f1854, %f1830; fma.rn.f32 %f1861, %f1859, %f1854, %f1832; fma.rn.f32 %f1863, %f1861, %f1854, %f1834; fma.rn.f32 %f1865, %f1863, %f1854, %f1836; fma.rn.f32 %f1867, %f1865, %f1854, %f1838; fma.rn.f32 %f1869, %f1867, %f1854, %f1840; fma.rn.f32 %f1871, %f1869, %f1854, %f1842; mul.f32 %f1872, %f1854, %f1871; fma.rn.f32 %f1873, %f1872, %f1854, %f1854; fma.rn.f32 %f4832, %f1853, %f1846, %f1873; setp.lt.u32 %p210, %r693, 2139095040; @%p210 bra $L__BB0_146; mov.f32 %f1875, 0f7F800000; fma.rn.f32 %f4832, %f155, %f1875, %f1875; $L__BB0_146: add.u64 %rd1080, %SPL, 32; setp.eq.f32 %p211, %f155, 0f00000000; selp.f32 %f159, 0fFF800000, %f4832, %p211; mov.b32 %r697, %f159; cvt.u64.u32 %rd630, %r697; mov.b32 %r698, %f154; cvt.u64.u32 %rd631, %r698; bfi.b64 %rd632, %rd630, %rd631, 32, 32; mov.u64 %rd1147, 0; st.local.u64 [%rd4], %rd632; st.local.u64 [%rd1], %rd1147; mov.u64 %rd1151, 1; st.local.u64 [%rd1080], %rd1151; setp.le.f32 %p212, %f159, %f154; setp.ge.f32 %p213, %f159, %f154; selp.b16 %rs15, 1, 2, %p213; setp.ltu.f32 %p214, %f159, %f154; selp.b16 %rs16, -1, 0, %p214; selp.b16 %rs17, %rs16, %rs15, %p212; setp.ne.s16 %p215, %rs17, -1; mov.f32 %f4833, %f159; mov.u64 %rd1148, %rd1151; @%p215 bra $L__BB0_148; add.u64 %rd1088, %SPL, 32; mov.u64 
%rd1148, 0; st.local.u64 [%rd1088], %rd1148; mov.u64 %rd1147, 1; st.local.u64 [%rd1], %rd1147; mov.f32 %f4833, %f154; $L__BB0_148: setp.ge.f32 %p216, %f159, %f4833; selp.b16 %rs18, 1, 2, %p216; setp.ltu.f32 %p217, %f159, %f4833; selp.b16 %rs19, -1, 0, %p217; setp.le.f32 %p218, %f159, %f4833; selp.b16 %rs20, %rs19, %rs18, %p218; setp.ne.s16 %p219, %rs20, -1; mov.u64 %rd1150, %rd1148; @%p219 bra $L__BB0_152; add.u64 %rd1149, %SPL, 32; shl.b64 %rd636, %rd1147, 2; add.s64 %rd637, %rd4, %rd636; ld.local.f32 %f1876, [%rd637]; setp.le.f32 %p220, %f159, %f1876; setp.ge.f32 %p221, %f159, %f1876; selp.b16 %rs21, 1, 2, %p221; setp.ltu.f32 %p222, %f159, %f1876; selp.b16 %rs22, -1, 0, %p222; selp.b16 %rs23, %rs22, %rs21, %p220; setp.ne.s16 %p223, %rs23, -1; @%p223 bra $L__BB0_151; add.u64 %rd1086, %SPL, 32; st.local.u64 [%rd1086], %rd1147; mov.u64 %rd1149, %rd1; $L__BB0_151: add.u64 %rd1084, %SPL, 32; mov.u64 %rd638, 1; st.local.u64 [%rd1149], %rd638; ld.local.u64 %rd1150, [%rd1084]; mov.u64 %rd1151, %rd1148; $L__BB0_152: ld.f32 %f161, [%rd121]; add.f32 %f1877, %f161, 0fBF800000; ld.global.f32 %f162, [%rd81+48]; sub.f32 %f163, %f162, %f1877; add.f32 %f1878, %f154, 0f00000000; add.f32 %f164, %f1878, %f159; shl.b64 %rd639, %rd1151, 2; add.s64 %rd140, %rd4, %rd639; ld.local.f32 %f165, [%rd140]; add.f32 %f166, %f120, %f120; mul.f32 %f1879, %f166, %f165; fma.rn.f32 %f1880, %f119, %f164, %f1879; setp.gtu.f32 %p224, %f1880, %f163; @%p224 bra $L__BB0_154; bra.uni $L__BB0_297; $L__BB0_154: add.f32 %f167, %f119, %f166; setp.gt.u64 %p225, %rd1150, 1; @%p225 bra $L__BB0_159; shl.b64 %rd640, %rd1150, 2; add.s64 %rd641, %rd4, %rd640; ld.local.f32 %f1881, [%rd641]; sub.f32 %f1882, %f164, %f165; mul.f32 %f168, %f119, %f1882; fma.rn.f32 %f1883, %f167, %f1881, %f168; setp.gtu.f32 %p226, %f1883, %f163; @%p226 bra $L__BB0_157; bra.uni $L__BB0_156; $L__BB0_157: fma.rn.f32 %f1886, %f119, 0f40400000, %f166; div.rn.f32 %f4834, %f163, %f1886; mov.b32 %r699, %f4834; st.local.v2.f32 [%rd4], {%f4834, 
%f4834}; mov.b64 %rd1152, {%r699, %r699}; bra.uni $L__BB0_158; $L__BB0_156: sub.f32 %f1884, %f163, %f168; div.rn.f32 %f1885, %f1884, %f167; st.local.f32 [%rd140], %f1885; ld.local.f32 %f4834, [%rd4+4]; ld.local.u64 %rd1152, [%rd4]; $L__BB0_158: cvt.u32.u64 %r700, %rd1152; mov.b32 %f1887, %r700; shr.u64 %rd642, %rd1152, 32; cvt.u32.u64 %r701, %rd642; mov.b32 %f1888, %r701; sub.f32 %f1889, %f154, %f1887; sub.f32 %f1890, %f159, %f1888; mul.f32 %f1891, %f1890, %f1890; fma.rn.f32 %f1892, %f1889, %f1889, %f1891; add.f32 %f1893, %f1892, 0f00000000; sqrt.rn.f32 %f1894, %f1893; ld.global.f32 %f1895, [%rd81+52]; fma.rn.f32 %f1896, %f1895, %f1894, %f161; min.f32 %f1897, %f1896, %f162; st.f32 [%rd121], %f1897; mov.f32 %f1898, 0f3F000000; mov.f32 %f1899, 0f3BBB989D; fma.rn.f32 %f1900, %f1887, %f1899, %f1898; mov.f32 %f1901, 0f3FB8AA3B; mov.f32 %f1902, 0f437C0000; cvt.sat.f32.f32 %f1903, %f1900; mov.f32 %f1904, 0f4B400001; fma.rm.f32 %f1905, %f1903, %f1902, %f1904; add.f32 %f1906, %f1905, 0fCB40007F; neg.f32 %f1907, %f1906; fma.rn.f32 %f1908, %f1887, %f1901, %f1907; mov.f32 %f1909, 0f32A57060; fma.rn.f32 %f1910, %f1887, %f1909, %f1908; mov.b32 %r702, %f1905; shl.b32 %r703, %r702, 23; mov.b32 %f1911, %r703; ex2.approx.ftz.f32 %f1912, %f1910; mul.f32 %f1913, %f1912, %f1911; fma.rn.f32 %f1914, %f4834, %f1899, %f1898; cvt.sat.f32.f32 %f1915, %f1914; fma.rm.f32 %f1916, %f1915, %f1902, %f1904; add.f32 %f1917, %f1916, 0fCB40007F; neg.f32 %f1918, %f1917; fma.rn.f32 %f1919, %f4834, %f1901, %f1918; fma.rn.f32 %f1920, %f4834, %f1909, %f1919; mov.b32 %r704, %f1916; shl.b32 %r705, %r704, 23; mov.b32 %f1921, %r705; ex2.approx.ftz.f32 %f1922, %f1920; mul.f32 %f1923, %f1922, %f1921; mov.b64 {%r706, %r707}, %rd131; mov.b64 {%r708, %r709}, %rd130; mov.b32 %f1924, %r708; mul.f32 %f1925, %f1924, %f1913; mov.b32 %f1926, %r709; mul.f32 %f1927, %f1926, %f1913; mov.b32 %f1928, %r706; mul.f32 %f1929, %f1928, %f1923; mov.b32 %f1930, %r707; mul.f32 %f1931, %f1930, %f1923; mov.b64 {%r710, %r711}, %rd133; 
mov.b64 {%r712, %r713}, %rd132; mov.b32 %f1932, %r712; mov.b32 %f1933, %r713; mul.f32 %f1934, %f1933, %f1929; mul.f32 %f1935, %f1933, %f1931; fma.rn.f32 %f4864, %f1932, %f1927, %f1935; mov.b32 %f1936, %r710; mov.b32 %f1937, %r711; mul.f32 %f1938, %f1937, %f1929; fma.rn.f32 %f4863, %f1936, %f1925, %f1938; mul.f32 %f1939, %f1937, %f1931; fma.rn.f32 %f1940, %f1936, %f1927, %f1939; fma.rn.f32 %f1941, %f1932, %f1925, %f1934; st.local.v4.f32 [%rd77], {%f1941, %f4864, %f4863, %f1940}; bra.uni $L__BB0_297; $L__BB0_109: mov.b32 %r36, %f105; shr.u32 %r515, %r36, 23; and.b32 %r516, %r515, 255; add.s32 %r37, %r516, -128; shl.b32 %r517, %r36, 8; or.b32 %r38, %r517, -2147483648; shr.u32 %r39, %r37, 5; mov.u32 %r1640, 0; mov.u64 %rd1141, __cudart_i2opi_f; mov.u64 %rd1142, %rd5; mov.u32 %r1641, %r1640; $L__BB0_110: .pragma "nounroll"; mov.u32 %r41, %r1641; ld.global.nc.u32 %r520, [%rd1141]; // begin inline asm { mad.lo.cc.u32 %r518, %r520, %r38, %r41; madc.hi.u32 %r1641, %r520, %r38, 0; } // end inline asm st.local.u32 [%rd1142], %r518; add.s64 %rd1142, %rd1142, 4; add.s64 %rd1141, %rd1141, 4; add.s32 %r1640, %r1640, 1; setp.ne.s32 %p153, %r1640, 6; @%p153 bra $L__BB0_110; mov.u32 %r525, -1560706194; // begin inline asm { mad.lo.cc.u32 %r523, %r525, %r38, %r41; madc.hi.u32 %r524, %r525, %r38, 0; } // end inline asm st.local.u32 [%rd111], %r524; mov.u32 %r528, 4; sub.s32 %r44, %r528, %r39; mov.u32 %r529, 6; sub.s32 %r530, %r529, %r39; mul.wide.s32 %rd584, %r530, 4; add.s64 %rd585, %rd5, %rd584; ld.local.u32 %r1642, [%rd585]; ld.local.u32 %r1643, [%rd585+-4]; and.b32 %r47, %r37, 31; setp.eq.s32 %p154, %r47, 0; @%p154 bra $L__BB0_113; mov.u32 %r531, 32; sub.s32 %r532, %r531, %r47; shr.u32 %r533, %r1643, %r532; shl.b32 %r534, %r1642, %r47; add.s32 %r1642, %r533, %r534; mul.wide.s32 %rd586, %r44, 4; add.s64 %rd587, %rd5, %rd586; ld.local.u32 %r535, [%rd587]; shr.u32 %r536, %r535, %r532; shl.b32 %r537, %r1643, %r47; add.s32 %r1643, %r536, %r537; $L__BB0_113: and.b32 %r538, %r36, 
-2147483648; shr.u32 %r539, %r1643, 30; shl.b32 %r540, %r1642, 2; or.b32 %r541, %r539, %r540; shr.u32 %r542, %r541, 31; shr.u32 %r543, %r1642, 30; add.s32 %r544, %r542, %r543; neg.s32 %r545, %r544; setp.eq.s32 %p155, %r538, 0; selp.b32 %r1644, %r544, %r545, %p155; setp.ne.s32 %p156, %r542, 0; xor.b32 %r546, %r538, -2147483648; selp.b32 %r547, %r546, %r538, %p156; selp.b32 %r548, -1, 0, %p156; xor.b32 %r549, %r541, %r548; shl.b32 %r550, %r1643, 2; xor.b32 %r551, %r550, %r548; cvt.u64.u32 %rd588, %r549; cvt.u64.u32 %rd589, %r551; bfi.b64 %rd590, %rd588, %rd589, 32, 32; cvt.rn.f64.s64 %fd3, %rd590; mul.f64 %fd4, %fd3, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f1586, %fd4; setp.eq.s32 %p157, %r547, 0; neg.f32 %f1587, %f1586; selp.f32 %f4826, %f1586, %f1587, %p157; $L__BB0_115: setp.lt.f32 %p158, %f91, 0f00000000; mov.f32 %f1589, 0f00000000; selp.f32 %f1590, 0fBF800000, 0f3F800000, %p158; mov.f32 %f1591, 0f3F800000; mul.f32 %f1592, %f91, %f1590; mul.f32 %f1593, %f4825, %f4825; mov.f32 %f1594, 0fBAB607ED; mov.f32 %f1595, 0f37CBAC00; fma.rn.f32 %f1596, %f1595, %f1593, %f1594; mov.f32 %f1597, 0f3D2AAABB; fma.rn.f32 %f1598, %f1596, %f1593, %f1597; mov.f32 %f1599, 0fBEFFFFFF; fma.rn.f32 %f1600, %f1598, %f1593, %f1599; fma.rn.f32 %f1601, %f1600, %f1593, %f1591; mov.f32 %f1602, 0f3C0885E4; mov.f32 %f1603, 0fB94D4153; fma.rn.f32 %f1604, %f1603, %f1593, %f1602; mov.f32 %f1605, 0fBE2AAAA8; fma.rn.f32 %f1606, %f1604, %f1593, %f1605; fma.rn.f32 %f1607, %f1593, %f4825, %f1589; fma.rn.f32 %f1608, %f1606, %f1607, %f4825; and.b32 %r553, %r1639, 1; setp.eq.b32 %p159, %r553, 1; selp.f32 %f1609, %f1601, %f1608, %p159; selp.f32 %f1610, %f1608, %f1601, %p159; neg.f32 %f1611, %f1609; and.b32 %r554, %r1639, 2; setp.eq.s32 %p160, %r554, 0; selp.f32 %f1612, %f1609, %f1611, %p160; neg.f32 %f1613, %f1610; add.s32 %r555, %r1639, 1; and.b32 %r556, %r555, 2; setp.eq.s32 %p161, %r556, 0; selp.f32 %f1614, %f1610, %f1613, %p161; mul.f32 %f1615, %f4826, %f4826; fma.rn.f32 %f1616, %f1595, %f1615, %f1594; 
fma.rn.f32 %f1617, %f1616, %f1615, %f1597; fma.rn.f32 %f1618, %f1617, %f1615, %f1599; fma.rn.f32 %f1619, %f1618, %f1615, %f1591; fma.rn.f32 %f1620, %f1615, %f4826, %f1589; fma.rn.f32 %f1621, %f1603, %f1615, %f1602; fma.rn.f32 %f1622, %f1621, %f1615, %f1605; fma.rn.f32 %f1623, %f1622, %f1620, %f4826; and.b32 %r557, %r1644, 1; setp.eq.b32 %p162, %r557, 1; selp.f32 %f1624, %f1619, %f1623, %p162; selp.f32 %f1625, %f1623, %f1619, %p162; and.b32 %r558, %r1644, 2; setp.eq.s32 %p163, %r558, 0; neg.f32 %f1626, %f1624; selp.f32 %f1627, %f1624, %f1626, %p163; add.s32 %r559, %r1644, 1; and.b32 %r560, %r559, 2; setp.eq.s32 %p164, %r560, 0; neg.f32 %f1628, %f1625; selp.f32 %f1629, %f1625, %f1628, %p164; mov.b32 %r561, %f1629; neg.f32 %f1630, %f1627; mov.b32 %r562, %f1627; cvt.u64.u32 %rd591, %r562; mov.b32 %r563, %f1630; cvt.u64.u32 %rd592, %r563; cvt.u64.u32 %rd593, %r561; bfi.b64 %rd594, %rd593, %rd592, 32, 32; mov.b64 {%r564, %r565}, %rd594; bfi.b64 %rd595, %rd591, %rd593, 32, 32; mov.b64 {%r566, %r567}, %rd595; mul.f32 %f1631, %f1590, %f1612; mov.b32 %r568, %f1631; cvt.u64.u32 %rd596, %r568; mov.b32 %r569, %f1614; cvt.u64.u32 %rd597, %r569; neg.f32 %f1632, %f1612; mov.b32 %r570, %f1632; mul.f32 %f1633, %f1590, %f1614; mov.b32 %r571, %f1633; cvt.u64.u32 %rd598, %r571; cvt.u64.u32 %rd599, %r570; bfi.b64 %rd600, %rd598, %rd599, 32, 32; mov.b64 {%r572, %r573}, %rd600; bfi.b64 %rd601, %rd596, %rd597, 32, 32; mov.b64 {%r574, %r575}, %rd601; ld.global.f32 %f1634, [%rd81+40]; sub.f32 %f1635, %f1591, %f1634; max.f32 %f1636, %f90, %f1635; ld.global.f32 %f1637, [%rd81+44]; add.f32 %f1638, %f1637, 0f3F800000; min.f32 %f1639, %f1636, %f1638; max.f32 %f1640, %f1592, %f1635; min.f32 %f1641, %f1640, %f1638; mul.f32 %f1642, %f1639, %f1641; mul.f32 %f1643, %f90, %f1592; div.rn.f32 %f1644, %f1643, %f1642; mul.f32 %f4865, %f4865, %f1644; sub.f32 %f1645, %f1591, %f4865; ld.global.f32 %f1646, [%rd81+48]; mul.f32 %f1647, %f1646, %f1645; mov.f32 %f1648, 0f3F000000; mov.f32 %f1649, 0f3BBB989D; 
fma.rn.f32 %f1650, %f1647, %f1649, %f1648; mov.f32 %f1651, 0f3FB8AA3B; mov.f32 %f1652, 0f437C0000; cvt.sat.f32.f32 %f1653, %f1650; mov.f32 %f1654, 0f4B400001; fma.rm.f32 %f1655, %f1653, %f1652, %f1654; add.f32 %f1656, %f1655, 0fCB40007F; neg.f32 %f1657, %f1656; fma.rn.f32 %f1658, %f1647, %f1651, %f1657; mov.f32 %f1659, 0f32A57060; fma.rn.f32 %f1660, %f1647, %f1659, %f1658; mov.b32 %r576, %f1655; shl.b32 %r577, %r576, 23; mov.b32 %f1661, %r577; ex2.approx.ftz.f32 %f1662, %f1660; mul.f32 %f1663, %f1662, %f1661; st.f32 [%rd112], %f1663; mov.b32 %f1664, %r566; mul.f32 %f1665, %f1639, %f1664; mov.b32 %f1666, %r567; mul.f32 %f1667, %f1639, %f1666; mov.b32 %f1668, %r564; mul.f32 %f1669, %f1641, %f1668; mov.b32 %f1670, %r565; mul.f32 %f1671, %f1641, %f1670; mov.b32 %f1672, %r574; mov.b32 %f1673, %r575; mul.f32 %f1674, %f1673, %f1669; mul.f32 %f1675, %f1673, %f1671; fma.rn.f32 %f4864, %f1672, %f1667, %f1675; mov.b32 %f1676, %r572; mov.b32 %f1677, %r573; mul.f32 %f1678, %f1677, %f1669; fma.rn.f32 %f4863, %f1676, %f1665, %f1678; mul.f32 %f1679, %f1677, %f1671; fma.rn.f32 %f1680, %f1676, %f1667, %f1679; fma.rn.f32 %f1681, %f1672, %f1665, %f1674; st.local.v4.f32 [%rd77], {%f1681, %f4864, %f4863, %f1680}; bra.uni $L__BB0_297; $L__BB0_306: setp.eq.f32 %p428, %f380, 0f00000000; setp.eq.f32 %p429, %f388, 0f7F800000; or.pred %p430, %p428, %p429; @%p430 bra $L__BB0_309; bra.uni $L__BB0_307; $L__BB0_309: setp.eq.f32 %p433, %f387, 0f3F800000; add.f32 %f3059, %f380, %f380; mov.b32 %r1138, %f3059; xor.b32 %r1139, %r1138, 2139095040; and.b32 %r1140, %r1139, 2147483647; selp.b32 %r1141, %r1139, %r1140, %p433; mov.b32 %f4870, %r1141; bra.uni $L__BB0_311; $L__BB0_242: setp.lt.s32 %p349, %r138, 0; min.f32 %f2613, %f296, %f295; max.f32 %f2614, %f296, %f295; div.rn.f32 %f2615, %f2613, %f2614; mul.rn.f32 %f2616, %f2615, %f2615; mov.f32 %f2617, 0fC0B59883; mov.f32 %f2618, 0fBF52C7EA; fma.rn.f32 %f2619, %f2616, %f2618, %f2617; mov.f32 %f2620, 0fC0D21907; fma.rn.f32 %f2621, %f2619, %f2616, %f2620; 
mul.f32 %f2622, %f2616, %f2621; mul.f32 %f2623, %f2615, %f2622; add.f32 %f2624, %f2616, 0f41355DC0; mov.f32 %f2625, 0f41E6BD60; fma.rn.f32 %f2626, %f2624, %f2616, %f2625; mov.f32 %f2627, 0f419D92C8; fma.rn.f32 %f2628, %f2626, %f2616, %f2627; rcp.rn.f32 %f2629, %f2628; fma.rn.f32 %f2630, %f2623, %f2629, %f2615; mov.f32 %f2631, 0f3FC90FDB; sub.f32 %f2632, %f2631, %f2630; setp.gt.f32 %p350, %f296, %f295; selp.f32 %f2633, %f2632, %f2630, %p350; mov.f32 %f2634, 0f40490FDB; sub.f32 %f2635, %f2634, %f2633; selp.f32 %f2636, %f2635, %f2633, %p349; mov.b32 %r936, %f2636; or.b32 %r937, %r139, %r936; mov.b32 %f2637, %r937; add.f32 %f2638, %f295, %f296; setp.le.f32 %p351, %f2638, 0f7F800000; selp.f32 %f4852, %f2637, %f2638, %p351; $L__BB0_245: abs.f32 %f301, %f290; setp.eq.f32 %p353, %f301, 0f00000000; abs.f32 %f302, %f291; setp.eq.f32 %p354, %f302, 0f00000000; and.pred %p355, %p353, %p354; mov.b32 %r140, %f290; mov.b32 %r943, %f291; and.b32 %r141, %r943, -2147483648; @%p355 bra $L__BB0_249; bra.uni $L__BB0_246; $L__BB0_249: shr.s32 %r948, %r140, 31; and.b32 %r949, %r948, 1078530011; or.b32 %r950, %r949, %r141; mov.b32 %f4853, %r950; bra.uni $L__BB0_250; $L__BB0_246: setp.eq.f32 %p356, %f301, 0f7F800000; setp.eq.f32 %p357, %f302, 0f7F800000; and.pred %p358, %p356, %p357; @%p358 bra $L__BB0_248; bra.uni $L__BB0_247; $L__BB0_248: setp.lt.s32 %p362, %r140, 0; selp.b32 %r946, 1075235812, 1061752795, %p362; or.b32 %r947, %r946, %r141; mov.b32 %f4853, %r947; bra.uni $L__BB0_250; $L__BB0_247: setp.lt.s32 %p359, %r140, 0; min.f32 %f2639, %f302, %f301; max.f32 %f2640, %f302, %f301; div.rn.f32 %f2641, %f2639, %f2640; mul.rn.f32 %f2642, %f2641, %f2641; mov.f32 %f2643, 0fC0B59883; mov.f32 %f2644, 0fBF52C7EA; fma.rn.f32 %f2645, %f2642, %f2644, %f2643; mov.f32 %f2646, 0fC0D21907; fma.rn.f32 %f2647, %f2645, %f2642, %f2646; mul.f32 %f2648, %f2642, %f2647; mul.f32 %f2649, %f2641, %f2648; add.f32 %f2650, %f2642, 0f41355DC0; mov.f32 %f2651, 0f41E6BD60; fma.rn.f32 %f2652, %f2650, %f2642, %f2651; 
mov.f32 %f2653, 0f419D92C8; fma.rn.f32 %f2654, %f2652, %f2642, %f2653; rcp.rn.f32 %f2655, %f2654; fma.rn.f32 %f2656, %f2649, %f2655, %f2641; mov.f32 %f2657, 0f3FC90FDB; sub.f32 %f2658, %f2657, %f2656; setp.gt.f32 %p360, %f302, %f301; selp.f32 %f2659, %f2658, %f2656, %p360; mov.f32 %f2660, 0f40490FDB; sub.f32 %f2661, %f2660, %f2659; selp.f32 %f2662, %f2661, %f2659, %p359; mov.b32 %r944, %f2662; or.b32 %r945, %r141, %r944; mov.b32 %f2663, %r945; add.f32 %f2664, %f301, %f302; setp.le.f32 %p361, %f2664, 0f7F800000; selp.f32 %f4853, %f2663, %f2664, %p361; $L__BB0_250: sub.f32 %f2665, %f4853, %f4852; mul.f32 %f307, %f2665, 0f3F000000; add.f32 %f2666, %f4852, %f4853; mul.f32 %f308, %f2666, 0f3F000000; mul.f32 %f2667, %f307, 0f3F22F983; cvt.rni.s32.f32 %r1669, %f2667; cvt.rn.f32.s32 %f2668, %r1669; mov.f32 %f2669, 0fBFC90FDA; fma.rn.f32 %f2670, %f2668, %f2669, %f307; mov.f32 %f2671, 0fB3A22168; fma.rn.f32 %f2672, %f2668, %f2671, %f2670; mov.f32 %f2673, 0fA7C234C5; fma.rn.f32 %f4854, %f2668, %f2673, %f2672; abs.f32 %f310, %f307; setp.ltu.f32 %p363, %f310, 0f47CE4780; @%p363 bra $L__BB0_258; setp.eq.f32 %p364, %f310, 0f7F800000; @%p364 bra $L__BB0_257; bra.uni $L__BB0_252; $L__BB0_257: mov.f32 %f2676, 0f00000000; mul.rn.f32 %f4854, %f307, %f2676; mov.u32 %r1669, 0; bra.uni $L__BB0_258; $L__BB0_192: setp.geu.f32 %p280, %f219, 0f00000000; mov.f32 %f4840, %f223; @%p280 bra $L__BB0_196; setp.eq.f32 %p281, %f218, 0f3F800000; neg.f32 %f2180, %f223; selp.f32 %f4840, %f2180, %f223, %p281; $L__BB0_196: mul.f32 %f228, %f209, 0f3F000000; sub.f32 %f2183, %f207, %f228; sub.f32 %f2184, %f208, %f228; mul.f32 %f2185, %f217, %f4840; mul.f32 %f229, %f2183, %f2185; mul.f32 %f230, %f2184, %f2185; div.rn.f32 %f2187, %f2116, %f219; add.f32 %f2188, %f219, %f2187; mul.f32 %f2189, %f210, 0f3F000000; mul.f32 %f2190, %f2188, %f2189; mul.f32 %f231, %f219, %f2190; neg.f32 %f232, %f231; setp.lt.f32 %p283, %f216, %f232; @%p283 bra $L__BB0_226; bra.uni $L__BB0_197; $L__BB0_226: mul.f32 %f2461, %f216, 
0fC0000000; div.rn.f32 %f2462, %f2461, %f210; add.f32 %f2463, %f2462, 0f3F800000; mov.f32 %f4848, 0f3F800000; sqrt.rn.f32 %f275, %f2463; abs.f32 %f276, %f275; setp.eq.f32 %p324, %f275, 0f3F800000; @%p324 bra $L__BB0_233; setp.gtu.f32 %p325, %f276, 0f7F800000; @%p325 bra $L__BB0_232; bra.uni $L__BB0_228; $L__BB0_232: mov.f32 %f2540, 0f3F000000; add.rn.f32 %f4848, %f275, %f2540; bra.uni $L__BB0_233; $L__BB0_197: mul.f32 %f233, %f176, %f216; setp.gt.f32 %p284, %f231, %f233; add.f32 %f234, %f176, %f176; @%p284 bra $L__BB0_214; bra.uni $L__BB0_198; $L__BB0_214: mul.f32 %f2330, %f234, %f216; div.rn.f32 %f2331, %f2330, %f210; add.f32 %f2332, %f2331, 0f3F800000; mov.f32 %f4845, 0f3F800000; sqrt.rn.f32 %f263, %f2332; abs.f32 %f264, %f263; setp.eq.f32 %p309, %f263, 0f3F800000; @%p309 bra $L__BB0_221; setp.gtu.f32 %p310, %f264, 0f7F800000; @%p310 bra $L__BB0_220; bra.uni $L__BB0_216; $L__BB0_220: mov.f32 %f2409, 0f3F000000; add.rn.f32 %f4845, %f263, %f2409; bra.uni $L__BB0_221; $L__BB0_198: add.f32 %f235, %f234, 0f3F800000; add.f32 %f236, %f235, %f235; sub.f32 %f2191, %f233, %f231; mul.f32 %f237, %f177, %f177; mul.f32 %f2192, %f237, %f2191; sub.f32 %f2193, %f232, %f216; mul.f32 %f238, %f2193, %f2192; mul.f32 %f2194, %f230, %f230; fma.rn.f32 %f2195, %f229, %f229, %f2194; add.f32 %f239, %f2195, 0f00000000; fma.rn.f32 %f2196, %f236, %f239, %f238; setp.lt.f32 %p285, %f2196, 0f38D1B717; @%p285 bra $L__BB0_238; ld.global.u8 %rs24, [%rd81+48]; setp.eq.s16 %p286, %rs24, 0; setp.leu.f32 %p287, %f216, 0f38D1B717; mov.f32 %f2197, 0f38D1B717; or.pred %p288, %p287, %p286; add.f32 %f2198, %f216, 0fB8D1B717; setp.leu.f32 %p289, %f2198, %f232; or.pred %p290, %p289, %p288; sub.f32 %f2199, %f2197, %f233; setp.geu.f32 %p291, %f2199, %f232; sqrt.rn.f32 %f240, %f239; or.pred %p292, %p291, %p290; @%p292 bra $L__BB0_206; mov.f32 %f2200, 0f3F800000; sub.f32 %f2201, %f2200, %f176; mul.f32 %f2202, %f2201, %f216; mul.f32 %f241, %f2202, 0f3F000000; add.f32 %f2203, %f231, %f241; fma.rn.f32 %f2204, %f240, 
0fBFB504F3, 0f00000000; mul.f32 %f2205, %f2204, %f2204; fma.rn.f32 %f2206, %f2203, %f2203, %f2205; add.f32 %f2207, %f2206, 0f00000000; sqrt.rn.f32 %f2208, %f2207; div.rn.f32 %f242, %f2203, %f2208; div.rn.f32 %f2209, %f2204, %f2208; add.f32 %f2210, %f233, %f241; mul.f32 %f2211, %f237, %f2210; sub.f32 %f2212, %f241, %f216; mul.f32 %f2213, %f2212, %f2211; mul.f32 %f2214, %f237, %f242; add.f32 %f2215, %f241, %f241; sub.f32 %f2216, %f2215, %f216; add.f32 %f2217, %f233, %f2216; mul.f32 %f243, %f2217, %f2214; mul.f32 %f2218, %f235, %f2209; mul.f32 %f2219, %f2209, %f2218; fma.rn.f32 %f2220, %f242, %f2214, %f2219; mul.f32 %f2221, %f2220, 0fC0800000; mul.f32 %f2222, %f2213, %f2221; fma.rn.f32 %f2223, %f243, %f243, %f2222; sqrt.rn.f32 %f244, %f2223; sub.f32 %f2224, %f244, %f243; add.f32 %f245, %f2220, %f2220; div.rn.f32 %f2225, %f2224, %f245; fma.rn.f32 %f4841, %f242, %f2225, %f241; sub.f32 %f2226, %f232, %f241; sub.f32 %f2227, %f4841, %f241; mul.f32 %f2228, %f2226, %f2227; setp.gt.f32 %p293, %f2228, 0f00000000; @%p293 bra $L__BB0_202; neg.f32 %f2229, %f243; sub.f32 %f2230, %f2229, %f244; div.rn.f32 %f2231, %f2230, %f245; fma.rn.f32 %f4841, %f242, %f2231, %f241; $L__BB0_202: mul.f32 %f2232, %f4841, 0fC0000000; div.rn.f32 %f2233, %f2232, %f210; add.f32 %f2234, %f2233, 0f3F800000; abs.f32 %f2235, %f2234; sqrt.rn.f32 %f249, %f2235; setp.leu.f32 %p294, %f249, 0f38D1B717; @%p294 bra $L__BB0_206; div.rn.f32 %f2236, %f219, %f249; setp.lt.f32 %p295, %f2236, 0f00800000; mul.f32 %f2237, %f2236, 0f4B000000; selp.f32 %f250, %f2237, %f2236, %p295; selp.f32 %f2238, 0fC1B80000, 0f00000000, %p295; mov.b32 %r851, %f250; add.s32 %r852, %r851, -1059760811; and.b32 %r853, %r852, -8388608; sub.s32 %r854, %r851, %r853; mov.b32 %f2239, %r854; cvt.rn.f32.s32 %f2240, %r853; mov.f32 %f2241, 0f34000000; fma.rn.f32 %f2242, %f2240, %f2241, %f2238; add.f32 %f2243, %f2239, 0fBF800000; mov.f32 %f2244, 0f3E1039F6; mov.f32 %f2245, 0fBE055027; fma.rn.f32 %f2246, %f2245, %f2243, %f2244; mov.f32 %f2247, 
0fBDF8CDCC; fma.rn.f32 %f2248, %f2246, %f2243, %f2247; mov.f32 %f2249, 0f3E0F2955; fma.rn.f32 %f2250, %f2248, %f2243, %f2249; mov.f32 %f2251, 0fBE2AD8B9; fma.rn.f32 %f2252, %f2250, %f2243, %f2251; mov.f32 %f2253, 0f3E4CED0B; fma.rn.f32 %f2254, %f2252, %f2243, %f2253; mov.f32 %f2255, 0fBE7FFF22; fma.rn.f32 %f2256, %f2254, %f2243, %f2255; mov.f32 %f2257, 0f3EAAAA78; fma.rn.f32 %f2258, %f2256, %f2243, %f2257; mov.f32 %f2259, 0fBF000000; fma.rn.f32 %f2260, %f2258, %f2243, %f2259; mul.f32 %f2261, %f2243, %f2260; fma.rn.f32 %f2262, %f2261, %f2243, %f2243; mov.f32 %f2263, 0f3F317218; fma.rn.f32 %f4842, %f2242, %f2263, %f2262; setp.lt.u32 %p296, %r851, 2139095040; @%p296 bra $L__BB0_205; mov.f32 %f2264, 0f7F800000; fma.rn.f32 %f4842, %f250, %f2264, %f2264; $L__BB0_205: setp.eq.f32 %p297, %f250, 0f00000000; selp.f32 %f2265, 0fFF800000, %f4842, %p297; add.f32 %f4851, %f4851, %f2265; $L__BB0_206: setp.eq.f32 %p722, %f219, 0f3F800000; neg.f32 %f2267, %f238; div.rn.f32 %f256, %f2267, %f236; mov.f32 %f2268, 0f3F000000; cvt.rzi.f32.f32 %f2269, %f2268; add.f32 %f2270, %f2269, %f2269; mov.f32 %f4844, 0f3F800000; sub.f32 %f2271, %f4844, %f2270; abs.f32 %f257, %f2271; mul.rn.f32 %f2272, %f221, %f4844; neg.f32 %f2273, %f2272; fma.rn.f32 %f2274, %f221, %f4844, %f2273; fma.rn.f32 %f2275, %f222, %f4844, %f2274; cvt.rni.f32.f32 %f2276, %f2272; sub.f32 %f2277, %f2272, %f2276; add.f32 %f2278, %f2275, %f2277; mov.f32 %f2279, 0f3AAF85ED; mov.f32 %f2280, 0f391FCB8E; fma.rn.f32 %f2281, %f2280, %f2278, %f2279; mov.f32 %f2282, 0f3C1D9856; fma.rn.f32 %f2283, %f2281, %f2278, %f2282; mov.f32 %f2284, 0f3D6357BB; fma.rn.f32 %f2285, %f2283, %f2278, %f2284; mov.f32 %f2286, 0f3E75FDEC; fma.rn.f32 %f2287, %f2285, %f2278, %f2286; mov.f32 %f2288, 0f3F317218; fma.rn.f32 %f2289, %f2287, %f2278, %f2288; fma.rn.f32 %f2290, %f2289, %f2278, %f4844; cvt.rzi.s32.f32 %r855, %f2276; setp.gt.f32 %p298, %f2276, 0f00000000; selp.b32 %r856, 0, -2097152000, %p298; add.s32 %r857, %r856, 2130706432; mov.b32 %f2291, %r857; 
mul.f32 %f2292, %f2290, %f2291; shl.b32 %r858, %r855, 23; sub.s32 %r859, %r858, %r856; mov.b32 %f2293, %r859; mul.f32 %f2294, %f2292, %f2293; abs.f32 %f2295, %f2272; setp.gt.f32 %p299, %f2295, 0f43180000; setp.lt.f32 %p300, %f2272, 0f00000000; selp.f32 %f2296, 0f00000000, 0f7F800000, %p300; selp.f32 %f258, %f2296, %f2294, %p299; @%p722 bra $L__BB0_213; abs.f32 %f4778, %f219; setp.gtu.f32 %p302, %f4778, 0f7F800000; @%p302 bra $L__BB0_212; bra.uni $L__BB0_208; $L__BB0_212: mov.f32 %f2299, 0f3F800000; add.rn.f32 %f4844, %f219, %f2299; bra.uni $L__BB0_213; $L__BB0_228: setp.eq.f32 %p326, %f275, 0f00000000; setp.eq.f32 %p327, %f276, 0f7F800000; or.pred %p328, %p326, %p327; @%p328 bra $L__BB0_231; bra.uni $L__BB0_229; $L__BB0_231: mov.f32 %f2533, 0f3E800000; cvt.rzi.f32.f32 %f2534, %f2533; add.f32 %f2535, %f2534, %f2534; mov.f32 %f2536, 0f3F000000; sub.f32 %f2537, %f2536, %f2535; abs.f32 %f2538, %f2537; setp.eq.f32 %p334, %f2538, 0f3F800000; add.f32 %f2539, %f275, %f275; mov.b32 %r912, %f2539; and.b32 %r913, %r912, 2147483647; selp.b32 %r914, %r912, %r913, %p334; mov.b32 %f4848, %r914; bra.uni $L__BB0_233; $L__BB0_252: mov.b32 %r143, %f307; shr.u32 %r953, %r143, 23; and.b32 %r954, %r953, 255; add.s32 %r144, %r954, -128; shl.b32 %r955, %r143, 8; or.b32 %r145, %r955, -2147483648; shr.u32 %r146, %r144, 5; mov.u32 %r1665, 0; mov.u64 %rd1159, __cudart_i2opi_f; mov.u64 %rd1160, %rd5; mov.u32 %r1666, %r1665; $L__BB0_253: .pragma "nounroll"; mov.u32 %r148, %r1666; ld.global.nc.u32 %r958, [%rd1159]; // begin inline asm { mad.lo.cc.u32 %r956, %r958, %r145, %r148; madc.hi.u32 %r1666, %r958, %r145, 0; } // end inline asm st.local.u32 [%rd1160], %r956; add.s64 %rd1160, %rd1160, 4; add.s64 %rd1159, %rd1159, 4; add.s32 %r1665, %r1665, 1; setp.ne.s32 %p365, %r1665, 6; @%p365 bra $L__BB0_253; mov.u32 %r963, -1560706194; // begin inline asm { mad.lo.cc.u32 %r961, %r963, %r145, %r148; madc.hi.u32 %r962, %r963, %r145, 0; } // end inline asm st.local.u32 [%rd111], %r962; mov.u32 %r966, 4; 
sub.s32 %r151, %r966, %r146; mov.u32 %r967, 6; sub.s32 %r968, %r967, %r146; mul.wide.s32 %rd674, %r968, 4; add.s64 %rd675, %rd5, %rd674; ld.local.u32 %r1667, [%rd675]; ld.local.u32 %r1668, [%rd675+-4]; and.b32 %r154, %r144, 31; setp.eq.s32 %p366, %r154, 0; @%p366 bra $L__BB0_256; mov.u32 %r969, 32; sub.s32 %r970, %r969, %r154; shr.u32 %r971, %r1668, %r970; shl.b32 %r972, %r1667, %r154; add.s32 %r1667, %r971, %r972; mul.wide.s32 %rd676, %r151, 4; add.s64 %rd677, %rd5, %rd676; ld.local.u32 %r973, [%rd677]; shr.u32 %r974, %r973, %r970; shl.b32 %r975, %r1668, %r154; add.s32 %r1668, %r974, %r975; $L__BB0_256: and.b32 %r976, %r143, -2147483648; shr.u32 %r977, %r1668, 30; shl.b32 %r978, %r1667, 2; or.b32 %r979, %r977, %r978; shr.u32 %r980, %r979, 31; shr.u32 %r981, %r1667, 30; add.s32 %r982, %r980, %r981; neg.s32 %r983, %r982; setp.eq.s32 %p367, %r976, 0; selp.b32 %r1669, %r982, %r983, %p367; setp.ne.s32 %p368, %r980, 0; xor.b32 %r984, %r976, -2147483648; selp.b32 %r985, %r984, %r976, %p368; selp.b32 %r986, -1, 0, %p368; xor.b32 %r987, %r979, %r986; shl.b32 %r988, %r1668, 2; xor.b32 %r989, %r988, %r986; cvt.u64.u32 %rd678, %r987; cvt.u64.u32 %rd679, %r989; bfi.b64 %rd680, %rd678, %rd679, 32, 32; cvt.rn.f64.s64 %fd13, %rd680; mul.f64 %fd14, %fd13, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f2674, %fd14; setp.eq.s32 %p369, %r985, 0; neg.f32 %f2675, %f2674; selp.f32 %f4854, %f2674, %f2675, %p369; $L__BB0_258: mul.f32 %f2677, %f308, 0f3F22F983; cvt.rni.s32.f32 %r1674, %f2677; cvt.rn.f32.s32 %f2678, %r1674; fma.rn.f32 %f2680, %f2678, %f2669, %f308; fma.rn.f32 %f2682, %f2678, %f2671, %f2680; fma.rn.f32 %f4855, %f2678, %f2673, %f2682; abs.f32 %f315, %f308; setp.ltu.f32 %p370, %f315, 0f47CE4780; @%p370 bra $L__BB0_266; setp.eq.f32 %p371, %f315, 0f7F800000; @%p371 bra $L__BB0_265; bra.uni $L__BB0_260; $L__BB0_265: mov.f32 %f2686, 0f00000000; mul.rn.f32 %f4855, %f308, %f2686; mov.u32 %r1674, 0; bra.uni $L__BB0_266; $L__BB0_260: mov.b32 %r162, %f308; shr.u32 %r993, %r162, 23; and.b32 
%r994, %r993, 255; add.s32 %r163, %r994, -128; shl.b32 %r995, %r162, 8; or.b32 %r164, %r995, -2147483648; shr.u32 %r165, %r163, 5; mov.u32 %r1670, 0; mov.u64 %rd1161, __cudart_i2opi_f; mov.u64 %rd1162, %rd5; mov.u32 %r1671, %r1670; $L__BB0_261: .pragma "nounroll"; mov.u32 %r167, %r1671; ld.global.nc.u32 %r998, [%rd1161]; // begin inline asm { mad.lo.cc.u32 %r996, %r998, %r164, %r167; madc.hi.u32 %r1671, %r998, %r164, 0; } // end inline asm st.local.u32 [%rd1162], %r996; add.s64 %rd1162, %rd1162, 4; add.s64 %rd1161, %rd1161, 4; add.s32 %r1670, %r1670, 1; setp.ne.s32 %p372, %r1670, 6; @%p372 bra $L__BB0_261; mov.u32 %r1003, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1001, %r1003, %r164, %r167; madc.hi.u32 %r1002, %r1003, %r164, 0; } // end inline asm st.local.u32 [%rd111], %r1002; mov.u32 %r1006, 4; sub.s32 %r170, %r1006, %r165; mov.u32 %r1007, 6; sub.s32 %r1008, %r1007, %r165; mul.wide.s32 %rd682, %r1008, 4; add.s64 %rd683, %rd5, %rd682; ld.local.u32 %r1672, [%rd683]; ld.local.u32 %r1673, [%rd683+-4]; and.b32 %r173, %r163, 31; setp.eq.s32 %p373, %r173, 0; @%p373 bra $L__BB0_264; mov.u32 %r1009, 32; sub.s32 %r1010, %r1009, %r173; shr.u32 %r1011, %r1673, %r1010; shl.b32 %r1012, %r1672, %r173; add.s32 %r1672, %r1011, %r1012; mul.wide.s32 %rd684, %r170, 4; add.s64 %rd685, %rd5, %rd684; ld.local.u32 %r1013, [%rd685]; shr.u32 %r1014, %r1013, %r1010; shl.b32 %r1015, %r1673, %r173; add.s32 %r1673, %r1014, %r1015; $L__BB0_264: and.b32 %r1016, %r162, -2147483648; shr.u32 %r1017, %r1673, 30; shl.b32 %r1018, %r1672, 2; or.b32 %r1019, %r1017, %r1018; shr.u32 %r1020, %r1019, 31; shr.u32 %r1021, %r1672, 30; add.s32 %r1022, %r1020, %r1021; neg.s32 %r1023, %r1022; setp.eq.s32 %p374, %r1016, 0; selp.b32 %r1674, %r1022, %r1023, %p374; setp.ne.s32 %p375, %r1020, 0; xor.b32 %r1024, %r1016, -2147483648; selp.b32 %r1025, %r1024, %r1016, %p375; selp.b32 %r1026, -1, 0, %p375; xor.b32 %r1027, %r1019, %r1026; shl.b32 %r1028, %r1673, 2; xor.b32 %r1029, %r1028, %r1026; cvt.u64.u32 
%rd686, %r1027; cvt.u64.u32 %rd687, %r1029; bfi.b64 %rd688, %rd686, %rd687, 32, 32; cvt.rn.f64.s64 %fd15, %rd688; mul.f64 %fd16, %fd15, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f2684, %fd16; setp.eq.s32 %p376, %r1025, 0; neg.f32 %f2685, %f2684; selp.f32 %f4855, %f2684, %f2685, %p376; $L__BB0_266: mul.f32 %f2687, %f4854, %f4854; mov.f32 %f2688, 0fBAB607ED; mov.f32 %f2689, 0f37CBAC00; fma.rn.f32 %f2690, %f2689, %f2687, %f2688; mov.f32 %f2691, 0f3D2AAABB; fma.rn.f32 %f2692, %f2690, %f2687, %f2691; mov.f32 %f2693, 0fBEFFFFFF; fma.rn.f32 %f2694, %f2692, %f2687, %f2693; mov.f32 %f2695, 0f3F800000; fma.rn.f32 %f2696, %f2694, %f2687, %f2695; mov.f32 %f2697, 0f3C0885E4; mov.f32 %f4857, 0fB94D4153; fma.rn.f32 %f2699, %f4857, %f2687, %f2697; mov.f32 %f2700, 0fBE2AAAA8; fma.rn.f32 %f2701, %f2699, %f2687, %f2700; mov.f32 %f2702, 0f00000000; fma.rn.f32 %f2703, %f2687, %f4854, %f2702; fma.rn.f32 %f2704, %f2701, %f2703, %f4854; and.b32 %r1031, %r1669, 1; setp.eq.b32 %p377, %r1031, 1; selp.f32 %f2705, %f2696, %f2704, %p377; selp.f32 %f2706, %f2704, %f2696, %p377; neg.f32 %f2707, %f2705; and.b32 %r1032, %r1669, 2; setp.eq.s32 %p378, %r1032, 0; selp.f32 %f2708, %f2705, %f2707, %p378; neg.f32 %f2709, %f2706; add.s32 %r1033, %r1669, 1; and.b32 %r1034, %r1033, 2; setp.eq.s32 %p379, %r1034, 0; selp.f32 %f2710, %f2706, %f2709, %p379; mul.f32 %f2711, %f4855, %f4855; fma.rn.f32 %f2712, %f2689, %f2711, %f2688; fma.rn.f32 %f2713, %f2712, %f2711, %f2691; fma.rn.f32 %f2714, %f2713, %f2711, %f2693; fma.rn.f32 %f2715, %f2714, %f2711, %f2695; fma.rn.f32 %f2716, %f2711, %f4855, %f2702; fma.rn.f32 %f2717, %f4857, %f2711, %f2697; fma.rn.f32 %f2718, %f2717, %f2711, %f2700; fma.rn.f32 %f2719, %f2718, %f2716, %f4855; and.b32 %r1035, %r1674, 1; setp.eq.b32 %p380, %r1035, 1; selp.f32 %f2720, %f2715, %f2719, %p380; selp.f32 %f2721, %f2719, %f2715, %p380; and.b32 %r1036, %r1674, 2; setp.eq.s32 %p381, %r1036, 0; neg.f32 %f2722, %f2720; selp.f32 %f2723, %f2720, %f2722, %p381; add.s32 %r1037, %r1674, 1; and.b32 
%r1038, %r1037, 2; setp.eq.s32 %p382, %r1038, 0; neg.f32 %f2724, %f2721; selp.f32 %f2725, %f2721, %f2724, %p382; mov.b32 %r1039, %f2725; neg.f32 %f2726, %f2723; mov.b32 %r1040, %f2723; cvt.u64.u32 %rd689, %r1040; cvt.u64.u32 %rd690, %r1039; bfi.b64 %rd176, %rd689, %rd690, 32, 32; mov.b32 %r1041, %f2726; cvt.u64.u32 %rd691, %r1041; bfi.b64 %rd177, %rd690, %rd691, 32, 32; mul.f32 %f2727, %f293, %f2708; mov.b32 %r1042, %f2727; cvt.u64.u32 %rd692, %r1042; mov.b32 %r1043, %f2710; cvt.u64.u32 %rd693, %r1043; bfi.b64 %rd178, %rd692, %rd693, 32, 32; neg.f32 %f2728, %f2708; mov.b32 %r1044, %f2728; mul.f32 %f2729, %f293, %f2710; mov.b32 %r1045, %f2729; cvt.u64.u32 %rd694, %r1045; cvt.u64.u32 %rd695, %r1044; bfi.b64 %rd179, %rd694, %rd695, 32, 32; ld.global.f32 %f2730, [%rd81+44]; ld.f32 %f2731, [%rd167]; mul.f32 %f2732, %f2731, %f2730; ld.global.f32 %f2733, [%rd81+52]; sub.f32 %f2734, %f2732, %f2733; ld.global.f32 %f2735, [%rd81+48]; mul.f32 %f2736, %f2731, %f2735; neg.f32 %f2737, %f2736; mov.f32 %f2738, 0f3F000000; mov.f32 %f2739, 0f3BBB989D; fma.rn.f32 %f2740, %f2737, %f2739, %f2738; mov.f32 %f2741, 0f3FB8AA3B; mov.f32 %f2742, 0f437C0000; cvt.sat.f32.f32 %f2743, %f2740; mov.f32 %f2744, 0f4B400001; fma.rm.f32 %f2745, %f2743, %f2742, %f2744; add.f32 %f2746, %f2745, 0fCB40007F; neg.f32 %f2747, %f2746; fma.rn.f32 %f2748, %f2737, %f2741, %f2747; mov.f32 %f2749, 0f32A57060; fma.rn.f32 %f2750, %f2737, %f2749, %f2748; mov.b32 %r1046, %f2745; shl.b32 %r1047, %r1046, 23; mov.b32 %f2751, %r1047; ex2.approx.ftz.f32 %f2752, %f2750; mul.f32 %f2753, %f2752, %f2751; ld.global.f32 %f2754, [%rd81+40]; fma.rn.f32 %f319, %f2734, %f2753, %f2754; mul.f32 %f2755, %f319, 0f3F22F983; cvt.rni.s32.f32 %r1679, %f2755; cvt.rn.f32.s32 %f2756, %r1679; mov.f32 %f2757, 0fBFC90FDA; fma.rn.f32 %f2758, %f2756, %f2757, %f319; mov.f32 %f2759, 0fB3A22168; fma.rn.f32 %f2760, %f2756, %f2759, %f2758; mov.f32 %f2761, 0fA7C234C5; fma.rn.f32 %f4856, %f2756, %f2761, %f2760; abs.f32 %f321, %f319; setp.ltu.f32 %p383, 
%f321, 0f47CE4780; @%p383 bra $L__BB0_274; setp.eq.f32 %p384, %f321, 0f7F800000; @%p384 bra $L__BB0_273; bra.uni $L__BB0_268; $L__BB0_273: mov.f32 %f2764, 0f00000000; mul.rn.f32 %f4856, %f319, %f2764; mov.u32 %r1679, 0; bra.uni $L__BB0_274; $L__BB0_268: mov.b32 %r181, %f319; shr.u32 %r1050, %r181, 23; and.b32 %r1051, %r1050, 255; add.s32 %r182, %r1051, -128; shl.b32 %r1052, %r181, 8; or.b32 %r183, %r1052, -2147483648; shr.u32 %r184, %r182, 5; mov.u32 %r1675, 0; mov.u64 %rd1163, __cudart_i2opi_f; mov.u64 %rd1164, %rd5; mov.u32 %r1676, %r1675; $L__BB0_269: .pragma "nounroll"; mov.u32 %r186, %r1676; ld.global.nc.u32 %r1055, [%rd1163]; // begin inline asm { mad.lo.cc.u32 %r1053, %r1055, %r183, %r186; madc.hi.u32 %r1676, %r1055, %r183, 0; } // end inline asm st.local.u32 [%rd1164], %r1053; add.s64 %rd1164, %rd1164, 4; add.s64 %rd1163, %rd1163, 4; add.s32 %r1675, %r1675, 1; setp.ne.s32 %p385, %r1675, 6; @%p385 bra $L__BB0_269; mov.u32 %r1060, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1058, %r1060, %r183, %r186; madc.hi.u32 %r1059, %r1060, %r183, 0; } // end inline asm st.local.u32 [%rd111], %r1059; mov.u32 %r1063, 4; sub.s32 %r189, %r1063, %r184; mov.u32 %r1064, 6; sub.s32 %r1065, %r1064, %r184; mul.wide.s32 %rd697, %r1065, 4; add.s64 %rd698, %rd5, %rd697; ld.local.u32 %r1677, [%rd698]; ld.local.u32 %r1678, [%rd698+-4]; and.b32 %r192, %r182, 31; setp.eq.s32 %p386, %r192, 0; @%p386 bra $L__BB0_272; mov.u32 %r1066, 32; sub.s32 %r1067, %r1066, %r192; shr.u32 %r1068, %r1678, %r1067; shl.b32 %r1069, %r1677, %r192; add.s32 %r1677, %r1068, %r1069; mul.wide.s32 %rd699, %r189, 4; add.s64 %rd700, %rd5, %rd699; ld.local.u32 %r1070, [%rd700]; shr.u32 %r1071, %r1070, %r1067; shl.b32 %r1072, %r1678, %r192; add.s32 %r1678, %r1071, %r1072; $L__BB0_272: and.b32 %r1073, %r181, -2147483648; shr.u32 %r1074, %r1678, 30; shl.b32 %r1075, %r1677, 2; or.b32 %r1076, %r1074, %r1075; shr.u32 %r1077, %r1076, 31; shr.u32 %r1078, %r1677, 30; add.s32 %r1079, %r1077, %r1078; neg.s32 %r1080, 
%r1079; setp.eq.s32 %p387, %r1073, 0; selp.b32 %r1679, %r1079, %r1080, %p387; setp.ne.s32 %p388, %r1077, 0; xor.b32 %r1081, %r1073, -2147483648; selp.b32 %r1082, %r1081, %r1073, %p388; selp.b32 %r1083, -1, 0, %p388; xor.b32 %r1084, %r1076, %r1083; shl.b32 %r1085, %r1678, 2; xor.b32 %r1086, %r1085, %r1083; cvt.u64.u32 %rd701, %r1084; cvt.u64.u32 %rd702, %r1086; bfi.b64 %rd703, %rd701, %rd702, 32, 32; cvt.rn.f64.s64 %fd17, %rd703; mul.f64 %fd18, %fd17, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f2762, %fd18; setp.eq.s32 %p389, %r1082, 0; neg.f32 %f2763, %f2762; selp.f32 %f4856, %f2762, %f2763, %p389; $L__BB0_274: and.b32 %r199, %r1679, 1; setp.eq.s32 %p390, %r199, 0; selp.f32 %f325, %f4856, 0f3F800000, %p390; mul.rn.f32 %f326, %f4856, %f4856; @%p390 bra $L__BB0_276; mov.f32 %f2766, 0fBAB607ED; mov.f32 %f2767, 0f37CBAC00; fma.rn.f32 %f4857, %f2767, %f326, %f2766; $L__BB0_276: selp.f32 %f2768, 0f3C0885E4, 0f3D2AAABB, %p390; fma.rn.f32 %f2769, %f4857, %f326, %f2768; selp.f32 %f2770, 0fBE2AAAA8, 0fBEFFFFFF, %p390; fma.rn.f32 %f2771, %f2769, %f326, %f2770; mov.f32 %f2772, 0f00000000; fma.rn.f32 %f2773, %f326, %f325, %f2772; fma.rn.f32 %f4858, %f2771, %f2773, %f325; and.b32 %r1088, %r1679, 2; setp.eq.s32 %p392, %r1088, 0; @%p392 bra $L__BB0_278; mov.f32 %f2775, 0fBF800000; fma.rn.f32 %f4858, %f4858, %f2775, %f2772; $L__BB0_278: ld.f32 %f332, [%rd167+8]; mul.f32 %f2776, %f292, 0f4B000000; setp.lt.f32 %p393, %f292, 0f00800000; selp.f32 %f333, %f2776, %f292, %p393; selp.f32 %f2777, 0fC1B80000, 0f00000000, %p393; mov.b32 %r1089, %f333; add.s32 %r1090, %r1089, -1059760811; and.b32 %r1091, %r1090, -8388608; sub.s32 %r1092, %r1089, %r1091; mov.b32 %f2778, %r1092; cvt.rn.f32.s32 %f2779, %r1091; mov.f32 %f2780, 0f34000000; fma.rn.f32 %f2781, %f2779, %f2780, %f2777; add.f32 %f2782, %f2778, 0fBF800000; mov.f32 %f2783, 0f3E1039F6; mov.f32 %f2784, 0fBE055027; fma.rn.f32 %f2785, %f2784, %f2782, %f2783; mov.f32 %f2786, 0fBDF8CDCC; fma.rn.f32 %f2787, %f2785, %f2782, %f2786; mov.f32 %f2788, 
0f3E0F2955; fma.rn.f32 %f2789, %f2787, %f2782, %f2788; mov.f32 %f2790, 0fBE2AD8B9; fma.rn.f32 %f2791, %f2789, %f2782, %f2790; mov.f32 %f2792, 0f3E4CED0B; fma.rn.f32 %f2793, %f2791, %f2782, %f2792; mov.f32 %f2794, 0fBE7FFF22; fma.rn.f32 %f2795, %f2793, %f2782, %f2794; mov.f32 %f2796, 0f3EAAAA78; fma.rn.f32 %f2797, %f2795, %f2782, %f2796; mov.f32 %f2798, 0fBF000000; fma.rn.f32 %f2799, %f2797, %f2782, %f2798; mul.f32 %f2800, %f2782, %f2799; fma.rn.f32 %f2801, %f2800, %f2782, %f2782; mov.f32 %f2802, 0f3F317218; fma.rn.f32 %f4859, %f2781, %f2802, %f2801; setp.lt.u32 %p394, %r1089, 2139095040; @%p394 bra $L__BB0_280; mov.f32 %f2803, 0f7F800000; fma.rn.f32 %f4859, %f333, %f2803, %f2803; $L__BB0_280: setp.eq.f32 %p395, %f333, 0f00000000; selp.f32 %f337, 0fFF800000, %f4859, %p395; mul.f32 %f2804, %f294, 0f4B000000; setp.lt.f32 %p396, %f294, 0f00800000; selp.f32 %f338, %f2804, %f294, %p396; selp.f32 %f2805, 0fC1B80000, 0f00000000, %p396; mov.b32 %r1093, %f338; add.s32 %r1094, %r1093, -1059760811; and.b32 %r1095, %r1094, -8388608; sub.s32 %r1096, %r1093, %r1095; mov.b32 %f2806, %r1096; cvt.rn.f32.s32 %f2807, %r1095; fma.rn.f32 %f2809, %f2807, %f2780, %f2805; add.f32 %f2810, %f2806, 0fBF800000; fma.rn.f32 %f2813, %f2784, %f2810, %f2783; fma.rn.f32 %f2815, %f2813, %f2810, %f2786; fma.rn.f32 %f2817, %f2815, %f2810, %f2788; fma.rn.f32 %f2819, %f2817, %f2810, %f2790; fma.rn.f32 %f2821, %f2819, %f2810, %f2792; fma.rn.f32 %f2823, %f2821, %f2810, %f2794; fma.rn.f32 %f2825, %f2823, %f2810, %f2796; fma.rn.f32 %f2827, %f2825, %f2810, %f2798; mul.f32 %f2828, %f2810, %f2827; fma.rn.f32 %f2829, %f2828, %f2810, %f2810; fma.rn.f32 %f4860, %f2809, %f2802, %f2829; setp.lt.u32 %p397, %r1093, 2139095040; @%p397 bra $L__BB0_282; mov.f32 %f2831, 0f7F800000; fma.rn.f32 %f4860, %f338, %f2831, %f2831; $L__BB0_282: add.u64 %rd1168, %SP, 32; add.u64 %rd1174, %SP, 64; setp.eq.f32 %p398, %f338, 0f00000000; selp.f32 %f2832, 0fFF800000, %f4860, %p398; fma.rn.f32 %f342, %f332, 0f3F000000, %f337; fma.rn.f32 
%f343, %f332, 0f3F000000, %f2832; add.f32 %f2833, %f342, 0f00000000; add.f32 %f344, %f2833, %f343; mul.f32 %f2834, %f344, 0f3F000000; sub.f32 %f345, %f342, %f2834; mov.b32 %r1097, %f345; sub.f32 %f346, %f343, %f2834; mov.b32 %r1098, %f346; cvt.u64.u32 %rd705, %r1098; cvt.u64.u32 %rd706, %r1097; bfi.b64 %rd707, %rd705, %rd706, 32, 32; mov.u64 %rd708, 0; st.local.u64 [%rd1166], %rd707; st.local.u64 [%rd1172], %rd708; add.s64 %rd1165, %rd1166, 8; add.s64 %rd1178, %rd1172, 8; add.f32 %f2835, %f4858, %f4858; mul.f32 %f2836, %f2835, 0f3F5105EC; mov.f32 %f2837, 0f40400000; sub.f32 %f2838, %f2837, %f4858; div.rn.f32 %f347, %f2836, %f2838; mov.u64 %rd1179, 2; mov.u64 %rd1167, %rd1166; mov.u64 %rd1169, %rd1166; mov.u64 %rd1170, %rd1166; mov.u64 %rd1171, %rd1168; mov.u64 %rd1173, %rd1172; mov.u64 %rd1175, %rd1172; mov.u64 %rd1176, %rd1172; mov.u64 %rd1177, %rd1174; $L__BB0_283: setp.eq.s64 %p399, %rd1179, 0; @%p399 bra $L__BB0_289; add.s64 %rd1179, %rd1179, -1; add.s64 %rd709, %rd1166, 8; setp.eq.s64 %p400, %rd1169, %rd1165; selp.b64 %rd710, %rd709, %rd1169, %p400; add.s64 %rd711, %rd1167, 8; selp.b64 %rd712, %rd711, %rd1170, %p400; add.s64 %rd713, %rd1168, 8; selp.b64 %rd714, %rd713, %rd1171, %p400; setp.eq.s64 %p401, %rd1179, 0; add.s64 %rd715, %rd710, 4; add.s64 %rd716, %rd712, 4; add.s64 %rd717, %rd714, 4; selp.b64 %rd206, %rd710, %rd715, %p401; selp.b64 %rd1170, %rd712, %rd716, %p401; selp.b64 %rd1171, %rd714, %rd717, %p401; selp.b64 %rd1166, %rd709, %rd1166, %p400; selp.b64 %rd1167, %rd711, %rd1167, %p400; selp.b64 %rd1168, %rd713, %rd1168, %p400; add.s64 %rd718, %rd1169, 8; selp.b64 %rd1165, %rd718, %rd1165, %p400; add.s64 %rd719, %rd1175, 8; setp.eq.s64 %p402, %rd1172, %rd1178; selp.b64 %rd720, %rd719, %rd1172, %p402; add.s64 %rd721, %rd1176, 8; selp.b64 %rd722, %rd721, %rd1173, %p402; add.s64 %rd723, %rd1177, 8; selp.b64 %rd724, %rd723, %rd1174, %p402; selp.b64 %rd1175, %rd719, %rd1175, %p402; selp.b64 %rd1176, %rd721, %rd1176, %p402; selp.b64 %rd1177, %rd723, 
%rd1177, %p402; add.s64 %rd725, %rd1172, 8; selp.b64 %rd1178, %rd725, %rd1178, %p402; add.s64 %rd726, %rd720, 4; add.s64 %rd727, %rd722, 4; add.s64 %rd728, %rd724, 4; selp.b64 %rd1172, %rd720, %rd726, %p401; selp.b64 %rd1173, %rd722, %rd727, %p401; selp.b64 %rd1174, %rd724, %rd728, %p401; ld.local.f32 %f2839, [%rd722]; ld.local.f32 %f2840, [%rd712]; setp.eq.f32 %p403, %f2840, %f2839; mov.u64 %rd1169, %rd206; @%p403 bra $L__BB0_283; setp.gt.f32 %p404, %f344, 0f00000000; @%p404 bra $L__BB0_289; bra.uni $L__BB0_286; $L__BB0_289: mul.f32 %f2882, %f343, %f343; fma.rn.f32 %f2883, %f342, %f342, %f2882; add.f32 %f2884, %f2883, 0f00000000; sqrt.rn.f32 %f2885, %f2884; mov.b32 %r1683, %f2885; mov.u32 %r1682, 1; $L__BB0_290: mov.u64 %rd1180, 4575657222473777152; $L__BB0_291: mov.u64 %rd1041, 0; cvt.u64.u32 %rd733, %r1683; or.b64 %rd734, %rd733, %rd1041; shl.b64 %rd735, %rd734, 32; shr.u64 %rd736, %rd1180, 32; or.b64 %rd225, %rd735, %rd736; setp.eq.s32 %p406, %r1682, 0; @%p406 bra $L__BB0_297; mov.b64 {%r1109, %r1110}, %rd225; mov.b32 %f350, %r1110; cvt.u32.u64 %r1111, %rd736; cvt.u32.u64 %r1112, %rd1180; mov.b32 %f351, %r1112; mov.b32 %f352, %r1111; mul.f32 %f2886, %f351, %f352; mul.f32 %f2887, %f292, %f294; sub.f32 %f2888, %f2886, %f2887; setp.gt.f32 %p407, %f2888, 0f00000000; ld.global.f32 %f2889, [%rd81+68]; fma.rn.f32 %f2890, %f2888, %f2889, %f2887; selp.f32 %f353, %f2886, %f2890, %p407; div.rn.f32 %f2891, %f2887, %f353; mul.f32 %f4865, %f4865, %f2891; setp.lt.f32 %p408, %f2887, 0f00800000; mul.f32 %f2892, %f2887, 0f4B000000; selp.f32 %f355, %f2892, %f2887, %p408; selp.f32 %f2893, 0fC1B80000, 0f00000000, %p408; mov.b32 %r1113, %f355; add.s32 %r1114, %r1113, -1059760811; and.b32 %r1115, %r1114, -8388608; sub.s32 %r1116, %r1113, %r1115; mov.b32 %f2894, %r1116; cvt.rn.f32.s32 %f2895, %r1115; mov.f32 %f2896, 0f34000000; fma.rn.f32 %f2897, %f2895, %f2896, %f2893; add.f32 %f2898, %f2894, 0fBF800000; mov.f32 %f2899, 0f3E1039F6; mov.f32 %f2900, 0fBE055027; fma.rn.f32 %f2901, 
%f2900, %f2898, %f2899; mov.f32 %f2902, 0fBDF8CDCC; fma.rn.f32 %f2903, %f2901, %f2898, %f2902; mov.f32 %f2904, 0f3E0F2955; fma.rn.f32 %f2905, %f2903, %f2898, %f2904; mov.f32 %f2906, 0fBE2AD8B9; fma.rn.f32 %f2907, %f2905, %f2898, %f2906; mov.f32 %f2908, 0f3E4CED0B; fma.rn.f32 %f2909, %f2907, %f2898, %f2908; mov.f32 %f2910, 0fBE7FFF22; fma.rn.f32 %f2911, %f2909, %f2898, %f2910; mov.f32 %f2912, 0f3EAAAA78; fma.rn.f32 %f2913, %f2911, %f2898, %f2912; mov.f32 %f2914, 0fBF000000; fma.rn.f32 %f2915, %f2913, %f2898, %f2914; mul.f32 %f2916, %f2898, %f2915; fma.rn.f32 %f2917, %f2916, %f2898, %f2898; mov.f32 %f2918, 0f3F317218; fma.rn.f32 %f4861, %f2897, %f2918, %f2917; setp.lt.u32 %p409, %r1113, 2139095040; @%p409 bra $L__BB0_294; mov.f32 %f2919, 0f7F800000; fma.rn.f32 %f4861, %f355, %f2919, %f2919; $L__BB0_294: setp.eq.f32 %p410, %f355, 0f00000000; selp.f32 %f359, 0fFF800000, %f4861, %p410; mul.f32 %f2920, %f353, 0f4B000000; setp.lt.f32 %p411, %f353, 0f00800000; selp.f32 %f360, %f2920, %f353, %p411; selp.f32 %f2921, 0fC1B80000, 0f00000000, %p411; mov.b32 %r1117, %f360; add.s32 %r1118, %r1117, -1059760811; and.b32 %r1119, %r1118, -8388608; sub.s32 %r1120, %r1117, %r1119; mov.b32 %f2922, %r1120; cvt.rn.f32.s32 %f2923, %r1119; fma.rn.f32 %f2925, %f2923, %f2896, %f2921; add.f32 %f2926, %f2922, 0fBF800000; fma.rn.f32 %f2929, %f2900, %f2926, %f2899; fma.rn.f32 %f2931, %f2929, %f2926, %f2902; fma.rn.f32 %f2933, %f2931, %f2926, %f2904; fma.rn.f32 %f2935, %f2933, %f2926, %f2906; fma.rn.f32 %f2937, %f2935, %f2926, %f2908; fma.rn.f32 %f2939, %f2937, %f2926, %f2910; fma.rn.f32 %f2941, %f2939, %f2926, %f2912; fma.rn.f32 %f2943, %f2941, %f2926, %f2914; mul.f32 %f2944, %f2926, %f2943; fma.rn.f32 %f2945, %f2944, %f2926, %f2926; fma.rn.f32 %f4862, %f2925, %f2918, %f2945; setp.lt.u32 %p412, %r1117, 2139095040; @%p412 bra $L__BB0_296; mov.f32 %f2947, 0f7F800000; fma.rn.f32 %f4862, %f360, %f2947, %f2947; $L__BB0_296: setp.eq.f32 %p413, %f360, 0f00000000; selp.f32 %f2948, 0fFF800000, %f4862, 
%p413; sub.f32 %f2949, %f359, %f2948; ld.f32 %f2950, [%rd167+8]; add.f32 %f2951, %f2950, %f2949; st.f32 [%rd167+8], %f2951; ld.f32 %f2952, [%rd167]; add.f32 %f2953, %f350, %f2952; st.f32 [%rd167], %f2953; mov.b64 {%r1121, %r1122}, %rd177; mov.b64 {%r1123, %r1124}, %rd176; mov.b32 %f2954, %r1123; mul.f32 %f2955, %f2954, %f351; mov.b32 %f2956, %r1124; mul.f32 %f2957, %f2956, %f351; mov.b32 %f2958, %r1121; mul.f32 %f2959, %f2958, %f352; mov.b32 %f2960, %r1122; mul.f32 %f2961, %f2960, %f352; mov.b64 {%r1125, %r1126}, %rd179; mov.b64 {%r1127, %r1128}, %rd178; mov.b32 %f2962, %r1127; mov.b32 %f2963, %r1128; mul.f32 %f2964, %f2963, %f2959; mul.f32 %f2965, %f2963, %f2961; fma.rn.f32 %f4864, %f2962, %f2957, %f2965; mov.b32 %f2966, %r1125; mov.b32 %f2967, %r1126; mul.f32 %f2968, %f2967, %f2959; fma.rn.f32 %f4863, %f2966, %f2955, %f2968; mul.f32 %f2969, %f2967, %f2961; fma.rn.f32 %f2970, %f2966, %f2957, %f2969; fma.rn.f32 %f2971, %f2962, %f2955, %f2964; st.local.v4.f32 [%rd77], {%f2971, %f4864, %f4863, %f2970}; bra.uni $L__BB0_297; $L__BB0_286: mul.f32 %f2841, %f346, %f346; fma.rn.f32 %f2842, %f345, %f345, %f2841; add.f32 %f2843, %f2842, 0f00000000; sqrt.rn.f32 %f348, %f2843; ld.global.f32 %f2844, [%rd81+56]; ld.global.f32 %f2845, [%rd81+60]; add.f32 %f2846, %f2845, %f2845; fma.rn.f32 %f2847, %f2844, 0f40000000, %f2846; div.rn.f32 %f2848, %f2847, %f2846; mul.f32 %f2849, %f344, %f2848; fma.rn.f32 %f349, %f347, %f2849, %f348; setp.le.f32 %p405, %f349, 0f00000000; mov.u32 %r1682, 0; @%p405 bra $L__BB0_290; div.rn.f32 %f2850, %f345, %f348; mul.f32 %f2851, %f349, %f2850; div.rn.f32 %f2852, %f346, %f348; mul.f32 %f2853, %f349, %f2852; sub.f32 %f2854, %f342, %f2851; sub.f32 %f2855, %f343, %f2853; mov.f32 %f2856, 0f3F000000; mov.f32 %f2857, 0f3BBB989D; fma.rn.f32 %f2858, %f2854, %f2857, %f2856; mov.f32 %f2859, 0f3FB8AA3B; mov.f32 %f2860, 0f437C0000; cvt.sat.f32.f32 %f2861, %f2858; mov.f32 %f2862, 0f4B400001; fma.rm.f32 %f2863, %f2861, %f2860, %f2862; add.f32 %f2864, %f2863, 
0fCB40007F; neg.f32 %f2865, %f2864; fma.rn.f32 %f2866, %f2854, %f2859, %f2865; mov.f32 %f2867, 0f32A57060; fma.rn.f32 %f2868, %f2854, %f2867, %f2866; mov.b32 %r1102, %f2863; shl.b32 %r1103, %r1102, 23; mov.b32 %f2869, %r1103; ex2.approx.ftz.f32 %f2870, %f2868; mul.f32 %f2871, %f2870, %f2869; fma.rn.f32 %f2872, %f2855, %f2857, %f2856; cvt.sat.f32.f32 %f2873, %f2872; fma.rm.f32 %f2874, %f2873, %f2860, %f2862; add.f32 %f2875, %f2874, 0fCB40007F; neg.f32 %f2876, %f2875; fma.rn.f32 %f2877, %f2855, %f2859, %f2876; fma.rn.f32 %f2878, %f2855, %f2867, %f2877; mov.b32 %r1104, %f2874; shl.b32 %r1105, %r1104, 23; mov.b32 %f2879, %r1105; ex2.approx.ftz.f32 %f2880, %f2878; mul.f32 %f2881, %f2880, %f2879; mov.b32 %r1106, %f2871; mov.b32 %r1107, %f2881; cvt.u64.u32 %rd729, %r1107; cvt.u64.u32 %rd730, %r1106; mov.b32 %r1683, %f349; bfi.b64 %rd1180, %rd729, %rd730, 32, 32; mov.u32 %r1682, 1; bra.uni $L__BB0_291; $L__BB0_307: setp.geu.f32 %p431, %f380, 0f00000000; mov.f32 %f4870, %f389; @%p431 bra $L__BB0_311; setp.eq.f32 %p432, %f387, 0f3F800000; neg.f32 %f3058, %f389; selp.f32 %f4870, %f3058, %f389, %p432; $L__BB0_311: add.f32 %f3061, %f385, 0f00000000; add.f32 %f3062, %f3061, %f386; fma.rn.f32 %f3063, %f3062, %f4870, 0fC0000000; mul.f32 %f3064, %f383, %f384; mul.f32 %f3065, %f3064, 0f3F000000; mul.f32 %f394, %f3065, %f3063; setp.lt.f32 %p434, %f380, 0f3F800000; @%p434 bra $L__BB0_315; bra.uni $L__BB0_312; $L__BB0_315: mul.f32 %f3103, %f4874, 0f3F7FBE77; mul.f32 %f4873, %f3103, %f4874; mov.f32 %f4872, 0f3A83126F; mov.f32 %f4874, %f394; bra.uni $L__BB0_316; $L__BB0_312: ld.global.f32 %f395, [%rd81+12]; mul.f32 %f3066, %f380, 0f4B000000; setp.lt.f32 %p435, %f380, 0f00800000; selp.f32 %f396, %f3066, %f380, %p435; selp.f32 %f3067, 0fC1B80000, 0f00000000, %p435; mov.b32 %r1142, %f396; add.s32 %r1143, %r1142, -1059760811; and.b32 %r1144, %r1143, -8388608; sub.s32 %r1145, %r1142, %r1144; mov.b32 %f3068, %r1145; cvt.rn.f32.s32 %f3069, %r1144; mov.f32 %f3070, 0f34000000; fma.rn.f32 %f3071, 
%f3069, %f3070, %f3067; add.f32 %f3072, %f3068, 0fBF800000; mov.f32 %f3073, 0f3E1039F6; mov.f32 %f3074, 0fBE055027; fma.rn.f32 %f3075, %f3074, %f3072, %f3073; mov.f32 %f3076, 0fBDF8CDCC; fma.rn.f32 %f3077, %f3075, %f3072, %f3076; mov.f32 %f3078, 0f3E0F2955; fma.rn.f32 %f3079, %f3077, %f3072, %f3078; mov.f32 %f3080, 0fBE2AD8B9; fma.rn.f32 %f3081, %f3079, %f3072, %f3080; mov.f32 %f3082, 0f3E4CED0B; fma.rn.f32 %f3083, %f3081, %f3072, %f3082; mov.f32 %f3084, 0fBE7FFF22; fma.rn.f32 %f3085, %f3083, %f3072, %f3084; mov.f32 %f3086, 0f3EAAAA78; fma.rn.f32 %f3087, %f3085, %f3072, %f3086; mov.f32 %f3088, 0fBF000000; fma.rn.f32 %f3089, %f3087, %f3072, %f3088; mul.f32 %f3090, %f3072, %f3089; fma.rn.f32 %f3091, %f3090, %f3072, %f3072; mov.f32 %f3092, 0f3F317218; fma.rn.f32 %f4871, %f3071, %f3092, %f3091; setp.lt.u32 %p436, %r1142, 2139095040; @%p436 bra $L__BB0_314; mov.f32 %f3093, 0f7F800000; fma.rn.f32 %f4871, %f396, %f3093, %f3093; $L__BB0_314: setp.eq.f32 %p437, %f396, 0f00000000; selp.f32 %f3094, 0fFF800000, %f4871, %p437; mul.f32 %f3095, %f384, 0f3F2AAAAB; mul.f32 %f3096, %f383, %f395; fma.rn.f32 %f3097, %f383, %f3095, %f3096; mul.f32 %f3098, %f3097, 0f3F000000; fma.rn.f32 %f3099, %f380, %f380, 0fBF800000; mul.f32 %f3100, %f3099, 0f3F000000; sub.f32 %f3101, %f3100, %f3094; mul.f32 %f4872, %f3098, %f3101; mov.f32 %f4873, %f394; $L__BB0_316: add.f32 %f3104, %f4872, %f4873; mul.f32 %f4876, %f3104, %f4874; $L__BB0_319: mov.b32 %f3146, %r1706; max.f32 %f413, %f3146, %f4876; ld.global.u32 %r206, [%rd81+80]; setp.eq.s32 %p440, %r206, 2; @%p440 bra $L__BB0_391; mov.b32 %f414, %r8; and.b16 %rs31, %rs7, 3; setp.eq.s16 %p441, %rs31, 1; @%p441 bra $L__BB0_336; setp.eq.s16 %p442, %rs31, 2; mov.f32 %f4877, 0f3F800000; @%p442 bra $L__BB0_324; setp.ne.s16 %p443, %rs31, 3; @%p443 bra $L__BB0_346; mov.u64 %rd1185, 0; mov.u64 %rd1186, %rd1185; bra.uni $L__BB0_378; $L__BB0_324: ld.global.f32 %f415, [%rd81+8]; div.rn.f32 %f3150, %f381, %f4875; div.rn.f32 %f416, %f3150, %f381; ld.global.u32 
%r207, [%rd81+12]; cvt.rn.f32.s32 %f417, %r207; mul.f32 %f3151, %f417, 0f3F000000; cvt.rzi.f32.f32 %f3152, %f3151; add.f32 %f3153, %f3152, %f3152; sub.f32 %f3154, %f417, %f3153; abs.f32 %f418, %f3154; abs.f32 %f419, %f416; setp.lt.f32 %p444, %f419, 0f00800000; mul.f32 %f3155, %f419, 0f4B800000; selp.f32 %f3156, %f3155, %f419, %p444; selp.f32 %f3157, 0fC1C00000, 0f00000000, %p444; mov.b32 %r1146, %f3156; add.s32 %r1147, %r1146, -1060439283; and.b32 %r1148, %r1147, -8388608; sub.s32 %r1149, %r1146, %r1148; mov.b32 %f3158, %r1149; cvt.rn.f32.s32 %f3159, %r1148; mov.f32 %f3160, 0f34000000; fma.rn.f32 %f3161, %f3159, %f3160, %f3157; add.f32 %f3162, %f3158, 0fBF800000; add.f32 %f3148, %f3158, 0f3F800000; mov.f32 %f3149, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f3147,%f3148; // end inline asm add.f32 %f3163, %f3162, %f3162; mul.f32 %f3164, %f3147, %f3163; mul.f32 %f3165, %f3164, %f3164; sub.f32 %f3166, %f3162, %f3164; add.f32 %f3167, %f3166, %f3166; neg.f32 %f3168, %f3164; fma.rn.f32 %f3169, %f3168, %f3162, %f3167; mul.rn.f32 %f3170, %f3147, %f3169; mov.f32 %f3171, 0f3B52E7DB; mov.f32 %f3172, 0f3A2C32E4; fma.rn.f32 %f3173, %f3172, %f3165, %f3171; mov.f32 %f3174, 0f3C93BB73; fma.rn.f32 %f3175, %f3173, %f3165, %f3174; mov.f32 %f3176, 0f3DF6384F; fma.rn.f32 %f3177, %f3175, %f3165, %f3176; mul.rn.f32 %f3178, %f3177, %f3165; mov.f32 %f3179, 0f3FB8AA3B; fma.rn.f32 %f3180, %f3164, %f3179, %f3161; sub.f32 %f3181, %f3161, %f3180; fma.rn.f32 %f3182, %f3164, %f3179, %f3181; fma.rn.f32 %f3183, %f3170, %f3179, %f3182; mov.f32 %f3184, 0f32A55E34; fma.rn.f32 %f3185, %f3164, %f3184, %f3183; mul.f32 %f3186, %f3178, 0f40400000; fma.rn.f32 %f3187, %f3186, %f3170, %f3185; fma.rn.f32 %f3188, %f3178, %f3164, %f3187; add.rn.f32 %f3189, %f3180, %f3188; neg.f32 %f3190, %f3180; add.rn.f32 %f3191, %f3189, %f3190; neg.f32 %f3192, %f3191; add.rn.f32 %f3193, %f3188, %f3192; mul.rn.f32 %f3194, %f3189, %f417; neg.f32 %f3195, %f3194; fma.rn.f32 %f3196, %f3189, %f417, %f3195; fma.rn.f32 %f3197, 
%f3193, %f417, %f3196; cvt.rni.f32.f32 %f3198, %f3194; sub.f32 %f3199, %f3194, %f3198; add.f32 %f3200, %f3197, %f3199; mov.f32 %f3201, 0f3AAF85ED; mov.f32 %f3202, 0f391FCB8E; fma.rn.f32 %f3203, %f3202, %f3200, %f3201; mov.f32 %f3204, 0f3C1D9856; fma.rn.f32 %f3205, %f3203, %f3200, %f3204; mov.f32 %f3206, 0f3D6357BB; fma.rn.f32 %f3207, %f3205, %f3200, %f3206; mov.f32 %f3208, 0f3E75FDEC; fma.rn.f32 %f3209, %f3207, %f3200, %f3208; mov.f32 %f3210, 0f3F317218; fma.rn.f32 %f3211, %f3209, %f3200, %f3210; fma.rn.f32 %f3212, %f3211, %f3200, %f3149; cvt.rzi.s32.f32 %r1150, %f3198; setp.gt.f32 %p445, %f3198, 0f00000000; selp.b32 %r1151, 0, -2097152000, %p445; add.s32 %r1152, %r1151, 2130706432; mov.b32 %f3213, %r1152; mul.f32 %f3214, %f3212, %f3213; shl.b32 %r1153, %r1150, 23; sub.s32 %r1154, %r1153, %r1151; mov.b32 %f3215, %r1154; mul.f32 %f3216, %f3214, %f3215; abs.f32 %f3217, %f3194; setp.gt.f32 %p446, %f3217, 0f43180000; setp.lt.f32 %p447, %f3194, 0f00000000; selp.f32 %f3218, 0f00000000, 0f7F800000, %p447; selp.f32 %f420, %f3218, %f3216, %p446; setp.eq.f32 %p448, %f416, 0f3F800000; setp.eq.s32 %p449, %r207, 0; or.pred %p450, %p448, %p449; @%p450 bra $L__BB0_333; setp.gtu.f32 %p451, %f419, 0f7F800000; @%p451 bra $L__BB0_332; abs.f32 %f421, %f417; setp.gtu.f32 %p452, %f421, 0f7F800000; @%p452 bra $L__BB0_332; bra.uni $L__BB0_327; $L__BB0_332: add.rn.f32 %f4877, %f416, %f417; $L__BB0_333: add.f32 %f3224, %f4877, 0fBF800000; mul.f32 %f3225, %f415, %f3224; ld.global.f32 %f3226, [%rd81+20]; neg.f32 %f3227, %f3226; max.f32 %f3228, %f3225, %f3227; mul.f32 %f3229, %f4865, %f3228; neg.f32 %f3230, %f3229; mov.f32 %f3231, 0f00000000; st.local.v4.f32 [%rd1], {%f3231, %f3231, %f3231, %f3231}; mov.u64 %rd749, 0; st.local.v2.u64 [%rd89], {%rd749, %rd749}; mov.u32 %r1160, 1065353216; st.local.u32 [%rd89], %r1160; st.local.u32 [%rd89+12], %r1160; ld.local.v4.f32 {%f3232, %f3233, %f3234, %f3235}, [%rd89]; mul.f32 %f4878, %f3232, %f3230; mul.f32 %f4879, %f3233, %f3230; mul.f32 %f4880, %f3234, 
%f3230; mul.f32 %f4881, %f3235, %f3230; ld.global.f32 %f434, [%rd81+16]; setp.eq.f32 %p464, %f434, 0f00000000; @%p464 bra $L__BB0_335; add.f32 %f3240, %f4866, %f4866; add.f32 %f3241, %f4868, %f4867; add.f32 %f3242, %f4869, %f4869; mul.f32 %f3243, %f3241, 0f3F000000; mul.f32 %f3244, %f3242, 0f3F000000; mul.f32 %f3245, %f3240, 0f3F000000; add.f32 %f3246, %f3245, 0f00000000; add.f32 %f3247, %f3244, %f3246; mul.f32 %f3248, %f3247, 0f3F000000; st.local.v4.f32 [%rd1], {%f3245, %f3243, %f3243, %f3244}; sub.f32 %f3249, %f3245, %f3248; st.local.f32 [%rd1], %f3249; sub.f32 %f3250, %f3244, %f3248; st.local.f32 [%rd1+12], %f3250; ld.local.v4.f32 {%f3251, %f3252, %f3253, %f3254}, [%rd1]; add.f32 %f3255, %f434, %f434; mul.f32 %f3256, %f4865, %f3255; fma.rn.f32 %f4878, %f3256, %f3251, %f4878; fma.rn.f32 %f4879, %f3256, %f3252, %f4879; fma.rn.f32 %f4880, %f3256, %f3253, %f4880; fma.rn.f32 %f4881, %f3256, %f3254, %f4881; $L__BB0_335: mov.b32 %r1161, %f4878; mov.b32 %r1162, %f4879; mov.b64 %rd1185, {%r1161, %r1162}; mov.b32 %r1163, %f4880; mov.b32 %r1164, %f4881; mov.b64 %rd1186, {%r1163, %r1164}; bra.uni $L__BB0_378; $L__BB0_336: ld.global.u64 %rd750, [%rd81+24]; shl.b64 %rd751, %rd70, 4; add.s64 %rd752, %rd750, %rd751; ld.f32 %f3264, [%rd752+8]; mul.f32 %f3265, %f414, 0f3F7FBE77; fma.rn.f32 %f443, %f3265, %f414, 0f3A83126F; mul.f32 %f3266, %f4875, %f4944; sub.f32 %f444, %f3266, %f379; ld.global.f32 %f3267, [%rd81+16]; mul.f32 %f3268, %f3267, 0f3F2AAAAB; ld.global.f32 %f3269, [%rd81+12]; mul.f32 %f3270, %f3264, %f3269; fma.rn.f32 %f445, %f3264, %f3268, %f3270; mul.f32 %f3271, %f4863, %f4863; fma.rn.f32 %f446, %f4875, %f4875, %f3271; mul.f32 %f3272, %f4863, %f4944; fma.rn.f32 %f447, %f4875, %f4864, %f3272; mul.f32 %f3273, %f4944, %f4944; fma.rn.f32 %f448, %f4864, %f4864, %f3273; mul.f32 %f449, %f3264, %f3267; mov.f32 %f3274, 0fBF000000; cvt.rzi.f32.f32 %f3275, %f3274; add.f32 %f3276, %f3275, %f3275; mov.f32 %f3277, 0fBF800000; sub.f32 %f3278, %f3277, %f3276; abs.f32 %f450, %f3278; 
abs.f32 %f451, %f444; setp.lt.f32 %p465, %f451, 0f00800000; mul.f32 %f3279, %f451, 0f4B800000; selp.f32 %f3280, %f3279, %f451, %p465; selp.f32 %f3281, 0fC1C00000, 0f00000000, %p465; mov.b32 %r1165, %f3280; add.s32 %r1166, %r1165, -1060439283; and.b32 %r1167, %r1166, -8388608; sub.s32 %r1168, %r1165, %r1167; mov.b32 %f3282, %r1168; cvt.rn.f32.s32 %f3283, %r1167; mov.f32 %f3284, 0f34000000; fma.rn.f32 %f3285, %f3283, %f3284, %f3281; add.f32 %f3286, %f3282, 0fBF800000; add.f32 %f3262, %f3282, 0f3F800000; mov.f32 %f4882, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f3261,%f3262; // end inline asm add.f32 %f3287, %f3286, %f3286; mul.f32 %f3288, %f3261, %f3287; mul.f32 %f3289, %f3288, %f3288; sub.f32 %f3290, %f3286, %f3288; add.f32 %f3291, %f3290, %f3290; neg.f32 %f3292, %f3288; fma.rn.f32 %f3293, %f3292, %f3286, %f3291; mul.rn.f32 %f3294, %f3261, %f3293; mov.f32 %f3295, 0f3B52E7DB; mov.f32 %f3296, 0f3A2C32E4; fma.rn.f32 %f3297, %f3296, %f3289, %f3295; mov.f32 %f3298, 0f3C93BB73; fma.rn.f32 %f3299, %f3297, %f3289, %f3298; mov.f32 %f3300, 0f3DF6384F; fma.rn.f32 %f3301, %f3299, %f3289, %f3300; mul.rn.f32 %f3302, %f3301, %f3289; mov.f32 %f3303, 0f3FB8AA3B; fma.rn.f32 %f3304, %f3288, %f3303, %f3285; sub.f32 %f3305, %f3285, %f3304; fma.rn.f32 %f3306, %f3288, %f3303, %f3305; fma.rn.f32 %f3307, %f3294, %f3303, %f3306; mov.f32 %f3308, 0f32A55E34; fma.rn.f32 %f3309, %f3288, %f3308, %f3307; mul.f32 %f3310, %f3302, 0f40400000; fma.rn.f32 %f3311, %f3310, %f3294, %f3309; fma.rn.f32 %f3312, %f3302, %f3288, %f3311; add.rn.f32 %f3313, %f3304, %f3312; neg.f32 %f3314, %f3304; add.rn.f32 %f3315, %f3313, %f3314; neg.f32 %f3316, %f3315; add.rn.f32 %f3317, %f3312, %f3316; mul.rn.f32 %f3318, %f3313, %f3277; neg.f32 %f3319, %f3318; fma.rn.f32 %f3320, %f3313, %f3277, %f3319; fma.rn.f32 %f3321, %f3317, %f3277, %f3320; cvt.rni.f32.f32 %f3322, %f3318; sub.f32 %f3323, %f3318, %f3322; add.f32 %f3324, %f3321, %f3323; mov.f32 %f3325, 0f3AAF85ED; mov.f32 %f3326, 0f391FCB8E; fma.rn.f32 %f3327, 
%f3326, %f3324, %f3325; mov.f32 %f3328, 0f3C1D9856; fma.rn.f32 %f3329, %f3327, %f3324, %f3328; mov.f32 %f3330, 0f3D6357BB; fma.rn.f32 %f3331, %f3329, %f3324, %f3330; mov.f32 %f3332, 0f3E75FDEC; fma.rn.f32 %f3333, %f3331, %f3324, %f3332; mov.f32 %f3334, 0f3F317218; fma.rn.f32 %f3335, %f3333, %f3324, %f3334; fma.rn.f32 %f3336, %f3335, %f3324, %f4882; cvt.rzi.s32.f32 %r1169, %f3322; setp.gt.f32 %p466, %f3322, 0f00000000; selp.b32 %r1170, 0, -2097152000, %p466; add.s32 %r1171, %r1170, 2130706432; mov.b32 %f3337, %r1171; mul.f32 %f3338, %f3336, %f3337; shl.b32 %r1172, %r1169, 23; sub.s32 %r1173, %r1172, %r1170; mov.b32 %f3339, %r1173; mul.f32 %f3340, %f3338, %f3339; abs.f32 %f3341, %f3318; setp.gt.f32 %p467, %f3341, 0f43180000; setp.lt.f32 %p468, %f3318, 0f00000000; selp.f32 %f3342, 0f00000000, 0f7F800000, %p468; selp.f32 %f452, %f3342, %f3340, %p467; setp.eq.f32 %p469, %f444, 0f3F800000; @%p469 bra $L__BB0_343; setp.gtu.f32 %p470, %f451, 0f7F800000; @%p470 bra $L__BB0_342; bra.uni $L__BB0_338; $L__BB0_342: mov.f32 %f3345, 0fBF800000; add.rn.f32 %f4882, %f444, %f3345; bra.uni $L__BB0_343; $L__BB0_346: ld.global.u64 %rd754, [%rd81+24]; shl.b64 %rd755, %rd70, 4; add.s64 %rd756, %rd754, %rd755; ld.f32 %f484, [%rd756+8]; mul.f32 %f3372, %f4875, %f4944; sub.f32 %f485, %f3372, %f379; ld.local.v4.f32 {%f4875, %f3374, %f3375, %f3376}, [%rd77]; add.f32 %f3378, %f3376, %f4875; mul.f32 %f487, %f3378, 0f3F000000; sub.f32 %f3379, %f4875, %f3376; mul.f32 %f3380, %f3379, 0f3F000000; add.f32 %f3383, %f3374, %f3375; mul.f32 %f3384, %f3383, 0f3F000000; sub.f32 %f3385, %f3374, %f3375; mul.f32 %f488, %f3385, 0f3F000000; mul.f32 %f3386, %f488, %f488; fma.rn.f32 %f3387, %f487, %f487, %f3386; sqrt.rn.f32 %f3388, %f3387; mul.f32 %f3389, %f3384, %f3384; fma.rn.f32 %f3390, %f3380, %f3380, %f3389; sqrt.rn.f32 %f3391, %f3390; add.f32 %f489, %f3388, %f3391; sub.f32 %f490, %f3388, %f3391; abs.f32 %f491, %f3380; abs.f32 %f492, %f3384; setp.eq.f32 %p478, %f491, 0f00000000; setp.eq.f32 %p479, %f492, 
0f00000000; and.pred %p480, %p478, %p479; mov.b32 %r208, %f3380; mov.b32 %r1183, %f3384; and.b32 %r209, %r1183, -2147483648; @%p480 bra $L__BB0_350; bra.uni $L__BB0_347; $L__BB0_350: shr.s32 %r1188, %r208, 31; and.b32 %r1189, %r1188, 1078530011; or.b32 %r1190, %r1189, %r209; mov.b32 %f4891, %r1190; bra.uni $L__BB0_351; $L__BB0_347: setp.eq.f32 %p481, %f491, 0f7F800000; setp.eq.f32 %p482, %f492, 0f7F800000; and.pred %p483, %p481, %p482; @%p483 bra $L__BB0_349; bra.uni $L__BB0_348; $L__BB0_349: setp.lt.s32 %p487, %r208, 0; selp.b32 %r1186, 1075235812, 1061752795, %p487; or.b32 %r1187, %r1186, %r209; mov.b32 %f4891, %r1187; bra.uni $L__BB0_351; $L__BB0_348: setp.lt.s32 %p484, %r208, 0; min.f32 %f3392, %f492, %f491; max.f32 %f3393, %f492, %f491; div.rn.f32 %f3394, %f3392, %f3393; mul.rn.f32 %f3395, %f3394, %f3394; mov.f32 %f3396, 0fC0B59883; mov.f32 %f3397, 0fBF52C7EA; fma.rn.f32 %f3398, %f3395, %f3397, %f3396; mov.f32 %f3399, 0fC0D21907; fma.rn.f32 %f3400, %f3398, %f3395, %f3399; mul.f32 %f3401, %f3395, %f3400; mul.f32 %f3402, %f3394, %f3401; add.f32 %f3403, %f3395, 0f41355DC0; mov.f32 %f3404, 0f41E6BD60; fma.rn.f32 %f3405, %f3403, %f3395, %f3404; mov.f32 %f3406, 0f419D92C8; fma.rn.f32 %f3407, %f3405, %f3395, %f3406; rcp.rn.f32 %f3408, %f3407; fma.rn.f32 %f3409, %f3402, %f3408, %f3394; mov.f32 %f3410, 0f3FC90FDB; sub.f32 %f3411, %f3410, %f3409; setp.gt.f32 %p485, %f492, %f491; selp.f32 %f3412, %f3411, %f3409, %p485; mov.f32 %f3413, 0f40490FDB; sub.f32 %f3414, %f3413, %f3412; selp.f32 %f3415, %f3414, %f3412, %p484; mov.b32 %r1184, %f3415; or.b32 %r1185, %r209, %r1184; mov.b32 %f3416, %r1185; add.f32 %f3417, %f491, %f492; setp.le.f32 %p486, %f3417, 0f7F800000; selp.f32 %f4891, %f3416, %f3417, %p486; $L__BB0_351: abs.f32 %f497, %f487; setp.eq.f32 %p488, %f497, 0f00000000; abs.f32 %f498, %f488; setp.eq.f32 %p489, %f498, 0f00000000; and.pred %p490, %p488, %p489; mov.b32 %r210, %f487; mov.b32 %r1191, %f488; and.b32 %r211, %r1191, -2147483648; @%p490 bra $L__BB0_355; bra.uni 
$L__BB0_352; $L__BB0_355: shr.s32 %r1196, %r210, 31; and.b32 %r1197, %r1196, 1078530011; or.b32 %r1198, %r1197, %r211; mov.b32 %f4892, %r1198; bra.uni $L__BB0_356; $L__BB0_352: setp.eq.f32 %p491, %f497, 0f7F800000; setp.eq.f32 %p492, %f498, 0f7F800000; and.pred %p493, %p491, %p492; @%p493 bra $L__BB0_354; bra.uni $L__BB0_353; $L__BB0_354: setp.lt.s32 %p497, %r210, 0; selp.b32 %r1194, 1075235812, 1061752795, %p497; or.b32 %r1195, %r1194, %r211; mov.b32 %f4892, %r1195; bra.uni $L__BB0_356; $L__BB0_353: setp.lt.s32 %p494, %r210, 0; min.f32 %f3418, %f498, %f497; max.f32 %f3419, %f498, %f497; div.rn.f32 %f3420, %f3418, %f3419; mul.rn.f32 %f3421, %f3420, %f3420; mov.f32 %f3422, 0fC0B59883; mov.f32 %f3423, 0fBF52C7EA; fma.rn.f32 %f3424, %f3421, %f3423, %f3422; mov.f32 %f3425, 0fC0D21907; fma.rn.f32 %f3426, %f3424, %f3421, %f3425; mul.f32 %f3427, %f3421, %f3426; mul.f32 %f3428, %f3420, %f3427; add.f32 %f3429, %f3421, 0f41355DC0; mov.f32 %f3430, 0f41E6BD60; fma.rn.f32 %f3431, %f3429, %f3421, %f3430; mov.f32 %f3432, 0f419D92C8; fma.rn.f32 %f3433, %f3431, %f3421, %f3432; rcp.rn.f32 %f3434, %f3433; fma.rn.f32 %f3435, %f3428, %f3434, %f3420; mov.f32 %f3436, 0f3FC90FDB; sub.f32 %f3437, %f3436, %f3435; setp.gt.f32 %p495, %f498, %f497; selp.f32 %f3438, %f3437, %f3435, %p495; mov.f32 %f3439, 0f40490FDB; sub.f32 %f3440, %f3439, %f3438; selp.f32 %f3441, %f3440, %f3438, %p494; mov.b32 %r1192, %f3441; or.b32 %r1193, %r211, %r1192; mov.b32 %f3442, %r1193; add.f32 %f3443, %f497, %f498; setp.le.f32 %p496, %f3443, 0f7F800000; selp.f32 %f4892, %f3442, %f3443, %p496; $L__BB0_356: sub.f32 %f3444, %f4892, %f4891; mul.f32 %f503, %f3444, 0f3F000000; add.f32 %f3445, %f4891, %f4892; mul.f32 %f504, %f3445, 0f3F000000; mul.f32 %f3446, %f503, 0f3F22F983; cvt.rni.s32.f32 %r1688, %f3446; cvt.rn.f32.s32 %f3447, %r1688; mov.f32 %f3448, 0fBFC90FDA; fma.rn.f32 %f3449, %f3447, %f3448, %f503; mov.f32 %f3450, 0fB3A22168; fma.rn.f32 %f3451, %f3447, %f3450, %f3449; mov.f32 %f3452, 0fA7C234C5; fma.rn.f32 %f4893, 
%f3447, %f3452, %f3451; abs.f32 %f506, %f503; setp.ltu.f32 %p498, %f506, 0f47CE4780; @%p498 bra $L__BB0_364; setp.eq.f32 %p499, %f506, 0f7F800000; @%p499 bra $L__BB0_363; bra.uni $L__BB0_358; $L__BB0_363: mov.f32 %f3455, 0f00000000; mul.rn.f32 %f4893, %f503, %f3455; mov.u32 %r1688, 0; bra.uni $L__BB0_364; $L__BB0_338: setp.eq.f32 %p471, %f444, 0f00000000; setp.eq.f32 %p472, %f451, 0f7F800000; or.pred %p473, %p471, %p472; @%p473 bra $L__BB0_341; bra.uni $L__BB0_339; $L__BB0_341: setp.eq.f32 %p476, %f450, 0f3F800000; add.f32 %f3344, %f444, %f444; mov.b32 %r1174, %f3344; xor.b32 %r1175, %r1174, 2139095040; and.b32 %r1176, %r1175, 2147483647; selp.b32 %r1177, %r1175, %r1176, %p476; mov.b32 %f4882, %r1177; bra.uni $L__BB0_343; $L__BB0_358: mov.b32 %r213, %f503; shr.u32 %r1201, %r213, 23; and.b32 %r1202, %r1201, 255; add.s32 %r214, %r1202, -128; shl.b32 %r1203, %r213, 8; or.b32 %r215, %r1203, -2147483648; shr.u32 %r216, %r214, 5; mov.u32 %r1684, 0; mov.u64 %rd1181, __cudart_i2opi_f; mov.u64 %rd1182, %rd5; mov.u32 %r1685, %r1684; $L__BB0_359: .pragma "nounroll"; mov.u32 %r218, %r1685; ld.global.nc.u32 %r1206, [%rd1181]; // begin inline asm { mad.lo.cc.u32 %r1204, %r1206, %r215, %r218; madc.hi.u32 %r1685, %r1206, %r215, 0; } // end inline asm st.local.u32 [%rd1182], %r1204; add.s64 %rd1182, %rd1182, 4; add.s64 %rd1181, %rd1181, 4; add.s32 %r1684, %r1684, 1; setp.ne.s32 %p500, %r1684, 6; @%p500 bra $L__BB0_359; add.s64 %rd1098, %rd5, 24; mov.u32 %r1211, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1209, %r1211, %r215, %r218; madc.hi.u32 %r1210, %r1211, %r215, 0; } // end inline asm st.local.u32 [%rd1098], %r1210; mov.u32 %r1214, 4; sub.s32 %r221, %r1214, %r216; mov.u32 %r1215, 6; sub.s32 %r1216, %r1215, %r216; mul.wide.s32 %rd758, %r1216, 4; add.s64 %rd759, %rd5, %rd758; ld.local.u32 %r1686, [%rd759]; ld.local.u32 %r1687, [%rd759+-4]; and.b32 %r224, %r214, 31; setp.eq.s32 %p501, %r224, 0; @%p501 bra $L__BB0_362; mov.u32 %r1217, 32; sub.s32 %r1218, %r1217, %r224; 
shr.u32 %r1219, %r1687, %r1218; shl.b32 %r1220, %r1686, %r224; add.s32 %r1686, %r1219, %r1220; mul.wide.s32 %rd760, %r221, 4; add.s64 %rd761, %rd5, %rd760; ld.local.u32 %r1221, [%rd761]; shr.u32 %r1222, %r1221, %r1218; shl.b32 %r1223, %r1687, %r224; add.s32 %r1687, %r1222, %r1223; $L__BB0_362: and.b32 %r1224, %r213, -2147483648; shr.u32 %r1225, %r1687, 30; shl.b32 %r1226, %r1686, 2; or.b32 %r1227, %r1225, %r1226; shr.u32 %r1228, %r1227, 31; shr.u32 %r1229, %r1686, 30; add.s32 %r1230, %r1228, %r1229; neg.s32 %r1231, %r1230; setp.eq.s32 %p502, %r1224, 0; selp.b32 %r1688, %r1230, %r1231, %p502; setp.ne.s32 %p503, %r1228, 0; xor.b32 %r1232, %r1224, -2147483648; selp.b32 %r1233, %r1232, %r1224, %p503; selp.b32 %r1234, -1, 0, %p503; xor.b32 %r1235, %r1227, %r1234; shl.b32 %r1236, %r1687, 2; xor.b32 %r1237, %r1236, %r1234; cvt.u64.u32 %rd762, %r1235; cvt.u64.u32 %rd763, %r1237; bfi.b64 %rd764, %rd762, %rd763, 32, 32; cvt.rn.f64.s64 %fd19, %rd764; mul.f64 %fd20, %fd19, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f3453, %fd20; setp.eq.s32 %p504, %r1233, 0; neg.f32 %f3454, %f3453; selp.f32 %f4893, %f3453, %f3454, %p504; $L__BB0_364: mul.f32 %f3456, %f504, 0f3F22F983; cvt.rni.s32.f32 %r1693, %f3456; cvt.rn.f32.s32 %f3457, %r1693; fma.rn.f32 %f3459, %f3457, %f3448, %f504; fma.rn.f32 %f3461, %f3457, %f3450, %f3459; fma.rn.f32 %f4894, %f3457, %f3452, %f3461; abs.f32 %f511, %f504; setp.ltu.f32 %p505, %f511, 0f47CE4780; @%p505 bra $L__BB0_372; setp.eq.f32 %p506, %f511, 0f7F800000; @%p506 bra $L__BB0_371; bra.uni $L__BB0_366; $L__BB0_371: mov.f32 %f3465, 0f00000000; mul.rn.f32 %f4894, %f504, %f3465; mov.u32 %r1693, 0; bra.uni $L__BB0_372; $L__BB0_366: mov.b32 %r232, %f504; shr.u32 %r1241, %r232, 23; and.b32 %r1242, %r1241, 255; add.s32 %r233, %r1242, -128; shl.b32 %r1243, %r232, 8; or.b32 %r234, %r1243, -2147483648; shr.u32 %r235, %r233, 5; mov.u32 %r1689, 0; mov.u64 %rd1183, __cudart_i2opi_f; mov.u64 %rd1184, %rd5; mov.u32 %r1690, %r1689; $L__BB0_367: .pragma "nounroll"; mov.u32 %r237, 
%r1690; ld.global.nc.u32 %r1246, [%rd1183]; // begin inline asm { mad.lo.cc.u32 %r1244, %r1246, %r234, %r237; madc.hi.u32 %r1690, %r1246, %r234, 0; } // end inline asm st.local.u32 [%rd1184], %r1244; add.s64 %rd1184, %rd1184, 4; add.s64 %rd1183, %rd1183, 4; add.s32 %r1689, %r1689, 1; setp.ne.s32 %p507, %r1689, 6; @%p507 bra $L__BB0_367; add.s64 %rd1099, %rd5, 24; mov.u32 %r1251, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1249, %r1251, %r234, %r237; madc.hi.u32 %r1250, %r1251, %r234, 0; } // end inline asm st.local.u32 [%rd1099], %r1250; mov.u32 %r1254, 4; sub.s32 %r240, %r1254, %r235; mov.u32 %r1255, 6; sub.s32 %r1256, %r1255, %r235; mul.wide.s32 %rd766, %r1256, 4; add.s64 %rd767, %rd5, %rd766; ld.local.u32 %r1691, [%rd767]; ld.local.u32 %r1692, [%rd767+-4]; and.b32 %r243, %r233, 31; setp.eq.s32 %p508, %r243, 0; @%p508 bra $L__BB0_370; mov.u32 %r1257, 32; sub.s32 %r1258, %r1257, %r243; shr.u32 %r1259, %r1692, %r1258; shl.b32 %r1260, %r1691, %r243; add.s32 %r1691, %r1259, %r1260; mul.wide.s32 %rd768, %r240, 4; add.s64 %rd769, %rd5, %rd768; ld.local.u32 %r1261, [%rd769]; shr.u32 %r1262, %r1261, %r1258; shl.b32 %r1263, %r1692, %r243; add.s32 %r1692, %r1262, %r1263; $L__BB0_370: and.b32 %r1264, %r232, -2147483648; shr.u32 %r1265, %r1692, 30; shl.b32 %r1266, %r1691, 2; or.b32 %r1267, %r1265, %r1266; shr.u32 %r1268, %r1267, 31; shr.u32 %r1269, %r1691, 30; add.s32 %r1270, %r1268, %r1269; neg.s32 %r1271, %r1270; setp.eq.s32 %p509, %r1264, 0; selp.b32 %r1693, %r1270, %r1271, %p509; setp.ne.s32 %p510, %r1268, 0; xor.b32 %r1272, %r1264, -2147483648; selp.b32 %r1273, %r1272, %r1264, %p510; selp.b32 %r1274, -1, 0, %p510; xor.b32 %r1275, %r1267, %r1274; shl.b32 %r1276, %r1692, 2; xor.b32 %r1277, %r1276, %r1274; cvt.u64.u32 %rd770, %r1275; cvt.u64.u32 %rd771, %r1277; bfi.b64 %rd772, %rd770, %rd771, 32, 32; cvt.rn.f64.s64 %fd21, %rd772; mul.f64 %fd22, %fd21, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f3463, %fd22; setp.eq.s32 %p511, %r1273, 0; neg.f32 %f3464, %f3463; selp.f32 
%f4894, %f3463, %f3464, %p511; $L__BB0_372: setp.lt.f32 %p512, %f490, 0f00000000; mov.f32 %f3466, 0f00000000; selp.f32 %f3467, 0fBF800000, 0f3F800000, %p512; mov.f32 %f3468, 0f3F800000; mul.f32 %f3469, %f4893, %f4893; mov.f32 %f3470, 0fBAB607ED; mov.f32 %f3471, 0f37CBAC00; fma.rn.f32 %f3472, %f3471, %f3469, %f3470; mov.f32 %f3473, 0f3D2AAABB; fma.rn.f32 %f3474, %f3472, %f3469, %f3473; mov.f32 %f3475, 0fBEFFFFFF; fma.rn.f32 %f3476, %f3474, %f3469, %f3475; fma.rn.f32 %f3477, %f3476, %f3469, %f3468; mov.f32 %f3478, 0f3C0885E4; mov.f32 %f3479, 0fB94D4153; fma.rn.f32 %f3480, %f3479, %f3469, %f3478; mov.f32 %f3481, 0fBE2AAAA8; fma.rn.f32 %f3482, %f3480, %f3469, %f3481; fma.rn.f32 %f3483, %f3469, %f4893, %f3466; fma.rn.f32 %f3484, %f3482, %f3483, %f4893; and.b32 %r1279, %r1688, 1; setp.eq.b32 %p513, %r1279, 1; selp.f32 %f3485, %f3477, %f3484, %p513; selp.f32 %f3486, %f3484, %f3477, %p513; neg.f32 %f3487, %f3485; and.b32 %r1280, %r1688, 2; setp.eq.s32 %p514, %r1280, 0; selp.f32 %f3488, %f3485, %f3487, %p514; neg.f32 %f3489, %f3486; add.s32 %r1281, %r1688, 1; and.b32 %r1282, %r1281, 2; setp.eq.s32 %p515, %r1282, 0; selp.f32 %f3490, %f3486, %f3489, %p515; mul.f32 %f3491, %f4894, %f4894; fma.rn.f32 %f3492, %f3471, %f3491, %f3470; fma.rn.f32 %f3493, %f3492, %f3491, %f3473; fma.rn.f32 %f3494, %f3493, %f3491, %f3475; fma.rn.f32 %f3495, %f3494, %f3491, %f3468; fma.rn.f32 %f3496, %f3491, %f4894, %f3466; fma.rn.f32 %f3497, %f3479, %f3491, %f3478; fma.rn.f32 %f3498, %f3497, %f3491, %f3481; fma.rn.f32 %f3499, %f3498, %f3496, %f4894; and.b32 %r1283, %r1693, 1; setp.eq.b32 %p516, %r1283, 1; selp.f32 %f3500, %f3495, %f3499, %p516; selp.f32 %f3501, %f3499, %f3495, %p516; and.b32 %r1284, %r1693, 2; setp.eq.s32 %p517, %r1284, 0; neg.f32 %f3502, %f3500; selp.f32 %f3503, %f3500, %f3502, %p517; add.s32 %r1285, %r1693, 1; and.b32 %r1286, %r1285, 2; setp.eq.s32 %p518, %r1286, 0; neg.f32 %f3504, %f3501; selp.f32 %f3505, %f3501, %f3504, %p518; mov.b32 %r1287, %f3505; neg.f32 %f3506, %f3503; 
mov.b32 %r1288, %f3503; cvt.u64.u32 %rd773, %r1288; mov.b32 %r1289, %f3506; cvt.u64.u32 %rd774, %r1289; cvt.u64.u32 %rd775, %r1287; bfi.b64 %rd776, %rd775, %rd774, 32, 32; mov.b64 {%r1290, %r1291}, %rd776; bfi.b64 %rd777, %rd773, %rd775, 32, 32; mov.b64 {%r1292, %r1293}, %rd777; mul.f32 %f3507, %f3467, %f3488; mov.b32 %r1294, %f3507; cvt.u64.u32 %rd778, %r1294; mov.b32 %r1295, %f3490; cvt.u64.u32 %rd779, %r1295; neg.f32 %f3508, %f3488; mov.b32 %r1296, %f3508; mul.f32 %f3509, %f3467, %f3490; mov.b32 %r1297, %f3509; cvt.u64.u32 %rd780, %r1297; cvt.u64.u32 %rd781, %r1296; bfi.b64 %rd782, %rd780, %rd781, 32, 32; mov.b64 {%r1298, %r1299}, %rd782; bfi.b64 %rd783, %rd778, %rd779, 32, 32; mov.b64 {%r1300, %r1301}, %rd783; add.f32 %f515, %f489, 0fBF800000; fma.rn.f32 %f516, %f490, %f3467, 0fBF800000; mov.b32 %f517, %r1292; mov.b32 %f518, %r1293; mov.b32 %f519, %r1290; mov.b32 %f520, %r1291; mov.b32 %f521, %r1300; mov.b32 %f522, %r1301; mov.b32 %f523, %r1298; mov.b32 %f524, %r1299; add.f32 %f525, %f485, 0fBF800000; setp.eq.f32 %p519, %f414, 0f3F800000; @%p519 bra $L__BB0_377; bra.uni $L__BB0_373; $L__BB0_377: ld.global.f32 %f3575, [%rd81+20]; add.f32 %f3576, %f3575, %f3575; mul.f32 %f3577, %f484, %f3576; mul.f32 %f3578, %f515, %f517; mul.f32 %f3579, %f515, %f518; mul.f32 %f3580, %f516, %f519; mul.f32 %f3581, %f522, %f3580; fma.rn.f32 %f3582, %f521, %f3578, %f3581; mul.f32 %f3583, %f516, %f520; mul.f32 %f3584, %f522, %f3583; fma.rn.f32 %f3585, %f521, %f3579, %f3584; mul.f32 %f3586, %f524, %f3580; fma.rn.f32 %f3587, %f523, %f3578, %f3586; mul.f32 %f3588, %f524, %f3583; fma.rn.f32 %f3589, %f523, %f3579, %f3588; mul.f32 %f3590, %f3582, %f3577; mul.f32 %f3591, %f3585, %f3577; mul.f32 %f3592, %f3587, %f3577; mul.f32 %f3593, %f3589, %f3577; mul.f32 %f3594, %f4863, %f3592; fma.rn.f32 %f3595, %f4875, %f3590, %f3594; mul.f32 %f3596, %f4863, %f3593; fma.rn.f32 %f3597, %f4875, %f3591, %f3596; mul.f32 %f3598, %f3592, %f4944; fma.rn.f32 %f3599, %f3590, %f4864, %f3598; mul.f32 %f3600, 
%f3593, %f4944; fma.rn.f32 %f3601, %f3591, %f4864, %f3600; ld.global.f32 %f3602, [%rd81+16]; mul.f32 %f3603, %f484, %f3602; mul.f32 %f3604, %f525, %f3603; mul.f32 %f3605, %f485, %f3604; mov.u64 %rd785, 0; st.local.v2.u64 [%rd1], {%rd785, %rd785}; mov.u32 %r1307, 1065353216; st.local.u32 [%rd1], %r1307; st.local.u32 [%rd1+12], %r1307; ld.local.v4.f32 {%f3606, %f3607, %f3608, %f3609}, [%rd1]; fma.rn.f32 %f3614, %f3605, %f3607, %f3597; mov.b32 %r1308, %f3614; fma.rn.f32 %f3615, %f3605, %f3606, %f3595; mov.b32 %r1309, %f3615; fma.rn.f32 %f3616, %f3605, %f3609, %f3601; mov.b32 %r1310, %f3616; fma.rn.f32 %f3617, %f3605, %f3608, %f3599; mov.b32 %r1311, %f3617; st.local.v4.f32 [%rd89], {%f3615, %f3614, %f3617, %f3616}; mov.b64 %rd1186, {%r1311, %r1310}; mov.b64 %rd1185, {%r1309, %r1308}; bra.uni $L__BB0_378; $L__BB0_373: ld.global.f32 %f3510, [%rd81+20]; add.f32 %f3511, %f3510, %f3510; mul.f32 %f3512, %f484, %f3511; max.f32 %f3514, %f515, %f3466; mul.f32 %f3515, %f517, %f3514; mul.f32 %f3516, %f518, %f3514; max.f32 %f3517, %f516, %f3466; mul.f32 %f3518, %f519, %f3517; mul.f32 %f3519, %f520, %f3517; mul.f32 %f3520, %f522, %f3518; fma.rn.f32 %f3521, %f521, %f3515, %f3520; mul.f32 %f3522, %f522, %f3519; fma.rn.f32 %f3523, %f521, %f3516, %f3522; mul.f32 %f3524, %f524, %f3518; fma.rn.f32 %f3525, %f523, %f3515, %f3524; mul.f32 %f3526, %f524, %f3519; fma.rn.f32 %f3527, %f523, %f3516, %f3526; mul.f32 %f3528, %f3521, %f3512; mul.f32 %f3529, %f3523, %f3512; mul.f32 %f3530, %f3525, %f3512; mul.f32 %f3531, %f3527, %f3512; mul.f32 %f3532, %f4863, %f3530; fma.rn.f32 %f4895, %f4875, %f3528, %f3532; mul.f32 %f3533, %f4863, %f3531; fma.rn.f32 %f4896, %f4875, %f3529, %f3533; mul.f32 %f3534, %f3530, %f4944; fma.rn.f32 %f4897, %f3528, %f4864, %f3534; mul.f32 %f3535, %f3531, %f4944; fma.rn.f32 %f4898, %f3529, %f4864, %f3535; min.f32 %f3536, %f515, %f3466; mul.f32 %f3537, %f517, %f3536; mul.f32 %f3538, %f518, %f3536; min.f32 %f3539, %f516, %f3466; mul.f32 %f3540, %f519, %f3539; mul.f32 %f3541, 
%f520, %f3539; mul.f32 %f3542, %f522, %f3540; fma.rn.f32 %f3543, %f521, %f3537, %f3542; mul.f32 %f3544, %f522, %f3541; fma.rn.f32 %f3545, %f521, %f3538, %f3544; mul.f32 %f3546, %f524, %f3540; fma.rn.f32 %f3547, %f523, %f3537, %f3546; mul.f32 %f3548, %f524, %f3541; fma.rn.f32 %f3549, %f523, %f3538, %f3548; mul.f32 %f3550, %f3512, %f3543; mul.f32 %f3551, %f3512, %f3545; mul.f32 %f3552, %f3512, %f3547; mul.f32 %f3553, %f3512, %f3549; mul.f32 %f3554, %f4863, %f3552; fma.rn.f32 %f4899, %f4875, %f3550, %f3554; mul.f32 %f3555, %f4863, %f3553; fma.rn.f32 %f4900, %f4875, %f3551, %f3555; mul.f32 %f3556, %f3552, %f4944; fma.rn.f32 %f4901, %f3550, %f4864, %f3556; mul.f32 %f3557, %f3553, %f4944; fma.rn.f32 %f4902, %f3551, %f4864, %f3557; ld.global.f32 %f3558, [%rd81+16]; mul.f32 %f3559, %f484, %f3558; mul.f32 %f3560, %f525, %f3559; mul.f32 %f3561, %f485, %f3560; st.local.v4.f32 [%rd1], {%f3466, %f3466, %f3466, %f3466}; mov.u64 %rd784, 0; st.local.v2.u64 [%rd89], {%rd784, %rd784}; mov.u32 %r1302, 1065353216; st.local.u32 [%rd89], %r1302; st.local.u32 [%rd89+12], %r1302; ld.local.v4.f32 {%f3562, %f3563, %f3564, %f3565}, [%rd89]; mul.f32 %f534, %f3561, %f3562; mul.f32 %f535, %f3561, %f3563; mul.f32 %f536, %f3561, %f3564; mul.f32 %f537, %f3561, %f3565; setp.lt.f32 %p520, %f485, 0f3F800000; @%p520 bra $L__BB0_375; bra.uni $L__BB0_374; $L__BB0_375: add.f32 %f4899, %f4899, %f534; add.f32 %f4900, %f4900, %f535; add.f32 %f4901, %f4901, %f536; add.f32 %f4902, %f4902, %f537; bra.uni $L__BB0_376; $L__BB0_374: add.f32 %f4895, %f4895, %f534; add.f32 %f4896, %f4896, %f535; add.f32 %f4897, %f4897, %f536; add.f32 %f4898, %f4898, %f537; $L__BB0_376: ld.global.u8 %rs32, [%rd81+8]; setp.ne.s16 %p521, %rs32, 0; setp.eq.f32 %p522, %f414, 0f00000000; and.pred %p523, %p522, %p521; selp.f32 %f3570, 0f00000000, 0f3F800000, %p523; fma.rn.f32 %f3571, %f4896, %f3570, %f4900; mov.b32 %r1303, %f3571; fma.rn.f32 %f3572, %f4895, %f3570, %f4899; mov.b32 %r1304, %f3572; fma.rn.f32 %f3573, %f4898, %f3570, %f4902; 
mov.b32 %r1305, %f3573; fma.rn.f32 %f3574, %f4897, %f3570, %f4901; mov.b32 %r1306, %f3574; mov.b64 %rd1186, {%r1306, %r1305}; mov.b64 %rd1185, {%r1304, %r1303}; bra.uni $L__BB0_378; $L__BB0_327: setp.eq.f32 %p453, %f416, 0f00000000; setp.eq.f32 %p454, %f419, 0f7F800000; or.pred %p455, %p453, %p454; @%p455 bra $L__BB0_331; bra.uni $L__BB0_328; $L__BB0_331: setp.eq.f32 %p462, %f418, 0f3F800000; add.f32 %f3223, %f416, %f416; mov.b32 %r1155, %f3223; xor.b32 %r1156, %r1155, 2139095040; setp.lt.s32 %p463, %r207, 0; selp.b32 %r1157, %r1156, %r1155, %p463; and.b32 %r1158, %r1157, 2147483647; selp.b32 %r1159, %r1157, %r1158, %p462; mov.b32 %f4877, %r1159; bra.uni $L__BB0_333; $L__BB0_216: setp.eq.f32 %p311, %f263, 0f00000000; setp.eq.f32 %p312, %f264, 0f7F800000; or.pred %p313, %p311, %p312; @%p313 bra $L__BB0_219; bra.uni $L__BB0_217; $L__BB0_219: mov.f32 %f2402, 0f3E800000; cvt.rzi.f32.f32 %f2403, %f2402; add.f32 %f2404, %f2403, %f2403; mov.f32 %f2405, 0f3F000000; sub.f32 %f2406, %f2405, %f2404; abs.f32 %f2407, %f2406; setp.eq.f32 %p319, %f2407, 0f3F800000; add.f32 %f2408, %f263, %f263; mov.b32 %r884, %f2408; and.b32 %r885, %r884, 2147483647; selp.b32 %r886, %r884, %r885, %p319; mov.b32 %f4845, %r886; bra.uni $L__BB0_221; $L__BB0_229: setp.lt.f32 %p329, %f276, 0f00800000; mul.f32 %f2466, %f276, 0f4B800000; selp.f32 %f2467, %f2466, %f276, %p329; mov.b32 %r903, %f2467; add.s32 %r904, %r903, -1060439283; and.b32 %r905, %r904, -8388608; sub.s32 %r906, %r903, %r905; mov.b32 %f2468, %r906; cvt.rn.f32.s32 %f2469, %r905; selp.f32 %f2470, 0fC1C00000, 0f00000000, %p329; mov.f32 %f2471, 0f34000000; fma.rn.f32 %f2472, %f2469, %f2471, %f2470; add.f32 %f2473, %f2468, 0fBF800000; add.f32 %f2465, %f2468, 0f3F800000; mov.f32 %f2474, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f2464,%f2465; // end inline asm add.f32 %f2475, %f2473, %f2473; mul.f32 %f2476, %f2464, %f2475; mul.f32 %f2477, %f2476, %f2476; neg.f32 %f2478, %f2476; sub.f32 %f2479, %f2473, %f2476; add.f32 %f2480, %f2479, 
%f2479; fma.rn.f32 %f2481, %f2478, %f2473, %f2480; mul.rn.f32 %f2482, %f2464, %f2481; mov.f32 %f2483, 0f3B52E7DB; mov.f32 %f2484, 0f3A2C32E4; fma.rn.f32 %f2485, %f2484, %f2477, %f2483; mov.f32 %f2486, 0f3C93BB73; fma.rn.f32 %f2487, %f2485, %f2477, %f2486; mov.f32 %f2488, 0f3DF6384F; fma.rn.f32 %f2489, %f2487, %f2477, %f2488; mul.rn.f32 %f2490, %f2489, %f2477; mov.f32 %f2491, 0f3FB8AA3B; fma.rn.f32 %f2492, %f2476, %f2491, %f2472; mul.f32 %f2493, %f2490, 0f40400000; sub.f32 %f2494, %f2472, %f2492; fma.rn.f32 %f2495, %f2476, %f2491, %f2494; fma.rn.f32 %f2496, %f2482, %f2491, %f2495; mov.f32 %f2497, 0f32A55E34; fma.rn.f32 %f2498, %f2476, %f2497, %f2496; fma.rn.f32 %f2499, %f2493, %f2482, %f2498; fma.rn.f32 %f2500, %f2490, %f2476, %f2499; add.rn.f32 %f2501, %f2492, %f2500; mov.f32 %f2502, 0f3F000000; mul.rn.f32 %f2503, %f2501, %f2502; cvt.rni.f32.f32 %f2504, %f2503; sub.f32 %f2505, %f2503, %f2504; neg.f32 %f2506, %f2503; fma.rn.f32 %f2507, %f2501, %f2502, %f2506; neg.f32 %f2508, %f2492; add.rn.f32 %f2509, %f2501, %f2508; neg.f32 %f2510, %f2509; add.rn.f32 %f2511, %f2500, %f2510; fma.rn.f32 %f2512, %f2511, %f2502, %f2507; add.f32 %f2513, %f2512, %f2505; setp.gt.f32 %p330, %f2504, 0f00000000; selp.b32 %r907, 0, -2097152000, %p330; setp.geu.f32 %p331, %f275, 0f00000000; setp.lt.f32 %p332, %f2503, 0f00000000; selp.f32 %f2514, 0f00000000, 0f7F800000, %p332; abs.f32 %f2515, %f2503; setp.gt.f32 %p333, %f2515, 0f43180000; cvt.rzi.s32.f32 %r908, %f2504; shl.b32 %r909, %r908, 23; sub.s32 %r910, %r909, %r907; mov.b32 %f2516, %r910; add.s32 %r911, %r907, 2130706432; mov.b32 %f2517, %r911; mov.f32 %f2518, 0f3AAF85ED; mov.f32 %f2519, 0f391FCB8E; fma.rn.f32 %f2520, %f2519, %f2513, %f2518; mov.f32 %f2521, 0f3C1D9856; fma.rn.f32 %f2522, %f2520, %f2513, %f2521; mov.f32 %f2523, 0f3D6357BB; fma.rn.f32 %f2524, %f2522, %f2513, %f2523; mov.f32 %f2525, 0f3E75FDEC; fma.rn.f32 %f2526, %f2524, %f2513, %f2525; mov.f32 %f2527, 0f3F317218; fma.rn.f32 %f2528, %f2526, %f2513, %f2527; fma.rn.f32 
%f2529, %f2528, %f2513, %f2474; mul.f32 %f2530, %f2529, %f2517; mul.f32 %f2531, %f2530, %f2516; selp.f32 %f4848, %f2514, %f2531, %p333; @%p331 bra $L__BB0_233; mov.f32 %f4848, 0f7FFFFFFF; $L__BB0_233: ld.global.u8 %rs26, [%rd81+48]; setp.eq.s16 %p335, %rs26, 0; @%p335 bra $L__BB0_237; div.rn.f32 %f2541, %f219, %f275; setp.lt.f32 %p336, %f2541, 0f00800000; mul.f32 %f2542, %f2541, 0f4B000000; selp.f32 %f281, %f2542, %f2541, %p336; selp.f32 %f2543, 0fC1B80000, 0f00000000, %p336; mov.b32 %r915, %f281; add.s32 %r916, %r915, -1059760811; and.b32 %r917, %r916, -8388608; sub.s32 %r918, %r915, %r917; mov.b32 %f2544, %r918; cvt.rn.f32.s32 %f2545, %r917; mov.f32 %f2546, 0f34000000; fma.rn.f32 %f2547, %f2545, %f2546, %f2543; add.f32 %f2548, %f2544, 0fBF800000; mov.f32 %f2549, 0f3E1039F6; mov.f32 %f2550, 0fBE055027; fma.rn.f32 %f2551, %f2550, %f2548, %f2549; mov.f32 %f2552, 0fBDF8CDCC; fma.rn.f32 %f2553, %f2551, %f2548, %f2552; mov.f32 %f2554, 0f3E0F2955; fma.rn.f32 %f2555, %f2553, %f2548, %f2554; mov.f32 %f2556, 0fBE2AD8B9; fma.rn.f32 %f2557, %f2555, %f2548, %f2556; mov.f32 %f2558, 0f3E4CED0B; fma.rn.f32 %f2559, %f2557, %f2548, %f2558; mov.f32 %f2560, 0fBE7FFF22; fma.rn.f32 %f2561, %f2559, %f2548, %f2560; mov.f32 %f2562, 0f3EAAAA78; fma.rn.f32 %f2563, %f2561, %f2548, %f2562; mov.f32 %f2564, 0fBF000000; fma.rn.f32 %f2565, %f2563, %f2548, %f2564; mul.f32 %f2566, %f2548, %f2565; fma.rn.f32 %f2567, %f2566, %f2548, %f2548; mov.f32 %f2568, 0f3F317218; fma.rn.f32 %f4849, %f2547, %f2568, %f2567; setp.lt.u32 %p337, %r915, 2139095040; @%p337 bra $L__BB0_236; mov.f32 %f2569, 0f7F800000; fma.rn.f32 %f4849, %f281, %f2569, %f2569; $L__BB0_236: setp.eq.f32 %p338, %f281, 0f00000000; selp.f32 %f2570, 0fFF800000, %f4849, %p338; add.f32 %f4851, %f4851, %f2570; $L__BB0_237: mov.b64 {%r919, %r920}, %rd156; mov.b64 {%r921, %r922}, %rd155; mov.b32 %f2571, %r921; mul.f32 %f2572, %f2571, %f4848; mov.b32 %f2573, %r922; mul.f32 %f2574, %f2573, %f4848; mov.b32 %f2575, %r919; mul.f32 %f2576, %f2575, 
%f4848; mov.b32 %f2577, %r920; mul.f32 %f2578, %f2577, %f4848; mov.b64 {%r923, %r924}, %rd158; mov.b64 {%r925, %r926}, %rd157; mov.b32 %f2579, %r925; mov.b32 %f2580, %r926; mul.f32 %f2581, %f2580, %f2576; mul.f32 %f2582, %f2580, %f2578; mov.b32 %f2583, %r923; mov.b32 %f2584, %r924; mul.f32 %f2585, %f2584, %f2576; mul.f32 %f2586, %f2584, %f2578; fma.rn.f32 %f2587, %f2579, %f2574, %f2582; mov.b32 %r927, %f2587; fma.rn.f32 %f2588, %f2579, %f2572, %f2581; mov.b32 %r928, %f2588; fma.rn.f32 %f2589, %f2583, %f2574, %f2586; mov.b32 %r929, %f2589; fma.rn.f32 %f2590, %f2583, %f2572, %f2585; mov.b32 %r930, %f2590; mov.b64 %rd1158, {%r930, %r929}; mov.b64 %rd1157, {%r928, %r927}; bra.uni $L__BB0_238; $L__BB0_339: setp.geu.f32 %p474, %f444, 0f00000000; mov.f32 %f4882, %f452; @%p474 bra $L__BB0_343; setp.eq.f32 %p475, %f450, 0f3F800000; neg.f32 %f3343, %f452; selp.f32 %f4882, %f3343, %f452, %p475; $L__BB0_343: add.f32 %f3346, %f446, 0f00000000; add.f32 %f3347, %f3346, %f448; mul.f32 %f3348, %f3347, 0f3F000000; sub.f32 %f3349, %f446, %f3348; sub.f32 %f3350, %f448, %f3348; mul.f32 %f3351, %f449, %f4882; mul.f32 %f4883, %f3349, %f3351; mul.f32 %f4884, %f447, %f3351; mul.f32 %f4886, %f3350, %f3351; fma.rn.f32 %f3352, %f444, %f444, 0fBF800000; mul.f32 %f3353, %f445, 0f3F000000; mul.f32 %f3354, %f3352, %f3353; mov.f32 %f3355, 0f00000000; st.local.v4.f32 [%rd1], {%f3355, %f3355, %f3355, %f3355}; mov.u64 %rd753, 0; st.local.v2.u64 [%rd89], {%rd753, %rd753}; mov.u32 %r1178, 1065353216; st.local.u32 [%rd89], %r1178; st.local.u32 [%rd89+12], %r1178; ld.local.v4.f32 {%f3356, %f3357, %f3358, %f3359}, [%rd89]; mul.f32 %f4887, %f3354, %f3356; mul.f32 %f4888, %f3354, %f3357; mul.f32 %f4889, %f3354, %f3358; mul.f32 %f4890, %f3354, %f3359; setp.ltu.f32 %p477, %f444, 0f3F800000; mov.f32 %f4885, %f4884; @%p477 bra $L__BB0_345; add.f32 %f4883, %f4883, %f4887; add.f32 %f473, %f4884, %f4888; add.f32 %f4885, %f4884, %f4889; add.f32 %f4886, %f4886, %f4890; st.local.v4.f32 [%rd1], {%f3355, %f3355, 
%f3355, %f3355}; mov.f32 %f4884, %f473; mov.f32 %f4887, %f3355; mov.f32 %f4888, %f3355; mov.f32 %f4889, %f3355; mov.f32 %f4890, %f3355; $L__BB0_345: fma.rn.f32 %f3368, %f443, %f4885, %f4889; mov.b32 %r1179, %f3368; fma.rn.f32 %f3369, %f443, %f4886, %f4890; mov.b32 %r1180, %f3369; fma.rn.f32 %f3370, %f443, %f4883, %f4887; mov.b32 %r1181, %f3370; fma.rn.f32 %f3371, %f443, %f4884, %f4888; mov.b32 %r1182, %f3371; mov.b64 %rd1185, {%r1181, %r1182}; mov.b64 %rd1186, {%r1179, %r1180}; $L__BB0_378: setp.eq.s32 %p525, %r206, 1; mov.pred %p723, 0; @%p525 bra $L__BB0_390; mov.b64 {%r1312, %r1313}, %rd1186; mov.b64 {%r1314, %r1315}, %rd1185; mov.b32 %f563, %r1314; abs.f32 %f3618, %f563; mov.b32 %f4905, %r1315; abs.f32 %f3619, %f4905; setp.le.f32 %p526, %f3619, %f3618; selp.f32 %f3620, %f3618, %f3619, %p526; mov.b32 %f565, %r1312; abs.f32 %f3621, %f565; setp.le.f32 %p527, %f3621, %f3620; selp.f32 %f3622, %f3620, %f3621, %p527; mov.b32 %f4906, %r1313; abs.f32 %f3623, %f4906; setp.le.f32 %p528, %f3623, %f3622; selp.f32 %f567, %f3622, %f3623, %p528; setp.eq.f32 %p529, %f567, 0f00000000; @%p529 bra $L__BB0_381; div.rn.f32 %f4905, %f4905, %f567; div.rn.f32 %f4906, %f4906, %f567; mov.b32 %r1316, %f4905; div.rn.f32 %f3624, %f563, %f567; mov.b32 %r1317, %f3624; mov.b64 %rd1185, {%r1317, %r1316}; $L__BB0_381: fma.rn.f32 %f3626, %f4905, %f4905, 0f00000000; sqrt.rn.f32 %f3627, %f3626; setp.ltu.f32 %p530, %f4905, 0f00000000; selp.f32 %f3628, 0fBF800000, 0f3F800000, %p530; neg.f32 %f3629, %f4905; selp.f32 %f3630, %f3629, %f4905, %p530; mul.f32 %f4907, %f3628, %f3627; fma.rn.f32 %f3631, %f3630, %f3627, %f3626; add.f32 %f573, %f3631, %f3631; setp.eq.f32 %p531, %f573, 0f00000000; @%p531 bra $L__BB0_383; add.f32 %f3632, %f4905, %f4907; sqrt.rn.f32 %f3633, %f573; div.rn.f32 %f3634, %f3632, %f3633; neg.f32 %f4907, %f4907; add.f32 %f3635, %f3634, %f3634; fma.rn.f32 %f3636, %f4906, %f3635, 0f00000000; mul.f32 %f3637, %f3634, %f3636; add.f32 %f3638, %f3637, 0f00000000; sub.f32 %f3639, %f4906, 
%f3637; sub.f32 %f3640, %f3639, %f3637; add.f32 %f3641, %f3638, %f3638; mul.f32 %f3642, %f3634, %f3641; fma.rn.f32 %f4906, %f3634, %f3642, %f3640; $L__BB0_383: abs.f32 %f578, %f4907; mov.b64 {%r250, %r1320}, %rd1185; mov.b32 %f4909, %r250; abs.f32 %f3643, %f578; abs.f32 %f3644, %f4909; abs.f32 %f3645, %f4906; add.f32 %f3646, %f3645, %f3644; mul.f32 %f3647, %f3646, 0f358637BD; setp.leu.f32 %p532, %f3643, %f3647; @%p532 bra $L__BB0_389; mov.b32 %r1321, %f578; cvt.u64.u32 %rd790, %r250; cvt.u64.u32 %rd791, %r1321; mov.b32 %r1322, %f4906; cvt.u64.u32 %rd792, %r1322; mov.u64 %rd789, 0; bfi.b64 %rd793, %rd792, %rd791, 32, 32; mov.b64 {%r1323, %r1324}, %rd793; bfi.b64 %rd794, %rd791, %rd790, 32, 32; mov.b64 {%r1325, %r1326}, %rd794; mov.b32 %f580, %r1325; mov.b32 %f3648, %r1326; mov.b32 %f3649, %r1323; mov.b32 %f581, %r1324; sub.f32 %f3650, %f580, %f581; mul.f32 %f3651, %f3650, 0f3F000000; mul.f32 %f3652, %f3651, %f3651; fma.rn.f32 %f582, %f3648, %f3649, %f3652; setp.ltu.f32 %p533, %f582, 0f00000000; mov.u64 %rd1188, %rd789; mov.u64 %rd1189, %rd789; mov.u64 %rd1190, %rd789; @%p533 bra $L__BB0_386; sqrt.rn.f32 %f3653, %f582; add.f32 %f3654, %f581, %f580; mul.f32 %f3655, %f3654, 0f3F000000; add.f32 %f3656, %f3655, %f3653; sub.f32 %f3657, %f3655, %f3653; mov.b32 %r1327, %f3656; mov.b32 %r1328, %f3657; cvt.u64.u32 %rd797, %r1328; cvt.u64.u32 %rd798, %r1327; bfi.b64 %rd799, %rd797, %rd798, 32, 32; shr.u64 %rd1189, %rd799, 32; shl.b64 %rd1188, %rd799, 32; mov.u64 %rd1190, 1; $L__BB0_386: or.b64 %rd256, %rd1190, %rd1188; or.b64 %rd257, %rd789, %rd1189; cvt.u32.u64 %r1329, %rd1188; cvt.u32.u64 %r1330, %rd1190; or.b32 %r1331, %r1330, %r1329; setp.eq.s32 %p534, %r1331, 0; @%p534 bra $L__BB0_388; mov.b64 {%r1332, %r1333}, %rd257; mov.b64 {%r1334, %r1335}, %rd256; mov.b32 %f4909, %r1335; mov.b32 %f4906, %r1332; $L__BB0_389: mul.f32 %f3658, %f567, %f4906; mul.f32 %f3659, %f567, %f4909; setp.le.f32 %p535, %f3659, %f3658; selp.f32 %f3660, %f3659, %f3658, %p535; setp.ge.f32 %p536, 
%f3659, %f3658; selp.f32 %f3661, %f3659, %f3658, %p536; ld.global.f32 %f3662, [%rd81+84]; setp.gt.f32 %p537, %f3661, %f3662; sub.f32 %f3663, %f3661, %f3660; mul.f32 %f3664, %f3663, 0f3F000000; ld.global.f32 %f3665, [%rd81+88]; setp.gt.f32 %p538, %f3664, %f3665; or.pred %p723, %p537, %p538; $L__BB0_390: selp.b32 %r8, 0, %r8, %p723; $L__BB0_391: mov.b32 %f588, %r8; and.b16 %rs33, %rs7, 3; setp.eq.s16 %p539, %rs33, 1; @%p539 bra $L__BB0_406; setp.eq.s16 %p540, %rs33, 2; mov.f32 %f4911, 0f3F800000; @%p540 bra $L__BB0_395; setp.ne.s16 %p541, %rs33, 3; @%p541 bra $L__BB0_416; mov.f32 %f4934, 0f00000000; mov.f32 %f4935, %f4934; mov.f32 %f4936, %f4934; mov.f32 %f4937, %f4934; bra.uni $L__BB0_448; $L__BB0_395: ld.global.f32 %f589, [%rd81+8]; div.rn.f32 %f3673, %f381, %f4875; div.rn.f32 %f590, %f3673, %f381; ld.global.u32 %r253, [%rd81+12]; cvt.rn.f32.s32 %f591, %r253; mul.f32 %f3674, %f591, 0f3F000000; cvt.rzi.f32.f32 %f3675, %f3674; add.f32 %f3676, %f3675, %f3675; sub.f32 %f3677, %f591, %f3676; abs.f32 %f592, %f3677; abs.f32 %f593, %f590; setp.lt.f32 %p542, %f593, 0f00800000; mul.f32 %f3678, %f593, 0f4B800000; selp.f32 %f3679, %f3678, %f593, %p542; selp.f32 %f3680, 0fC1C00000, 0f00000000, %p542; mov.b32 %r1336, %f3679; add.s32 %r1337, %r1336, -1060439283; and.b32 %r1338, %r1337, -8388608; sub.s32 %r1339, %r1336, %r1338; mov.b32 %f3681, %r1339; cvt.rn.f32.s32 %f3682, %r1338; mov.f32 %f3683, 0f34000000; fma.rn.f32 %f3684, %f3682, %f3683, %f3680; add.f32 %f3685, %f3681, 0fBF800000; add.f32 %f3671, %f3681, 0f3F800000; mov.f32 %f3672, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f3670,%f3671; // end inline asm add.f32 %f3686, %f3685, %f3685; mul.f32 %f3687, %f3670, %f3686; mul.f32 %f3688, %f3687, %f3687; sub.f32 %f3689, %f3685, %f3687; add.f32 %f3690, %f3689, %f3689; neg.f32 %f3691, %f3687; fma.rn.f32 %f3692, %f3691, %f3685, %f3690; mul.rn.f32 %f3693, %f3670, %f3692; mov.f32 %f3694, 0f3B52E7DB; mov.f32 %f3695, 0f3A2C32E4; fma.rn.f32 %f3696, %f3695, %f3688, %f3694; 
mov.f32 %f3697, 0f3C93BB73; fma.rn.f32 %f3698, %f3696, %f3688, %f3697; mov.f32 %f3699, 0f3DF6384F; fma.rn.f32 %f3700, %f3698, %f3688, %f3699; mul.rn.f32 %f3701, %f3700, %f3688; mov.f32 %f3702, 0f3FB8AA3B; fma.rn.f32 %f3703, %f3687, %f3702, %f3684; sub.f32 %f3704, %f3684, %f3703; fma.rn.f32 %f3705, %f3687, %f3702, %f3704; fma.rn.f32 %f3706, %f3693, %f3702, %f3705; mov.f32 %f3707, 0f32A55E34; fma.rn.f32 %f3708, %f3687, %f3707, %f3706; mul.f32 %f3709, %f3701, 0f40400000; fma.rn.f32 %f3710, %f3709, %f3693, %f3708; fma.rn.f32 %f3711, %f3701, %f3687, %f3710; add.rn.f32 %f3712, %f3703, %f3711; neg.f32 %f3713, %f3703; add.rn.f32 %f3714, %f3712, %f3713; neg.f32 %f3715, %f3714; add.rn.f32 %f3716, %f3711, %f3715; mul.rn.f32 %f3717, %f3712, %f591; neg.f32 %f3718, %f3717; fma.rn.f32 %f3719, %f3712, %f591, %f3718; fma.rn.f32 %f3720, %f3716, %f591, %f3719; cvt.rni.f32.f32 %f3721, %f3717; sub.f32 %f3722, %f3717, %f3721; add.f32 %f3723, %f3720, %f3722; mov.f32 %f3724, 0f3AAF85ED; mov.f32 %f3725, 0f391FCB8E; fma.rn.f32 %f3726, %f3725, %f3723, %f3724; mov.f32 %f3727, 0f3C1D9856; fma.rn.f32 %f3728, %f3726, %f3723, %f3727; mov.f32 %f3729, 0f3D6357BB; fma.rn.f32 %f3730, %f3728, %f3723, %f3729; mov.f32 %f3731, 0f3E75FDEC; fma.rn.f32 %f3732, %f3730, %f3723, %f3731; mov.f32 %f3733, 0f3F317218; fma.rn.f32 %f3734, %f3732, %f3723, %f3733; fma.rn.f32 %f3735, %f3734, %f3723, %f3672; cvt.rzi.s32.f32 %r1340, %f3721; setp.gt.f32 %p543, %f3721, 0f00000000; selp.b32 %r1341, 0, -2097152000, %p543; add.s32 %r1342, %r1341, 2130706432; mov.b32 %f3736, %r1342; mul.f32 %f3737, %f3735, %f3736; shl.b32 %r1343, %r1340, 23; sub.s32 %r1344, %r1343, %r1341; mov.b32 %f3738, %r1344; mul.f32 %f3739, %f3737, %f3738; abs.f32 %f3740, %f3717; setp.gt.f32 %p544, %f3740, 0f43180000; setp.lt.f32 %p545, %f3717, 0f00000000; selp.f32 %f3741, 0f00000000, 0f7F800000, %p545; selp.f32 %f594, %f3741, %f3739, %p544; setp.eq.f32 %p546, %f590, 0f3F800000; setp.eq.s32 %p547, %r253, 0; or.pred %p548, %p546, %p547; @%p548 bra 
$L__BB0_404; setp.gtu.f32 %p549, %f593, 0f7F800000; @%p549 bra $L__BB0_403; abs.f32 %f595, %f591; setp.gtu.f32 %p550, %f595, 0f7F800000; @%p550 bra $L__BB0_403; bra.uni $L__BB0_398; $L__BB0_403: add.rn.f32 %f4911, %f590, %f591; $L__BB0_404: add.f32 %f3747, %f4911, 0fBF800000; mul.f32 %f3748, %f589, %f3747; ld.global.f32 %f3749, [%rd81+20]; neg.f32 %f3750, %f3749; max.f32 %f3751, %f3748, %f3750; mul.f32 %f3752, %f4865, %f3751; neg.f32 %f3753, %f3752; mov.f32 %f3754, 0f00000000; st.local.v4.f32 [%rd1], {%f3754, %f3754, %f3754, %f3754}; mov.u64 %rd800, 0; st.local.v2.u64 [%rd89], {%rd800, %rd800}; mov.u32 %r1350, 1065353216; st.local.u32 [%rd89], %r1350; st.local.u32 [%rd89+12], %r1350; ld.local.v4.f32 {%f3755, %f3756, %f3757, %f3758}, [%rd89]; mul.f32 %f4934, %f3755, %f3753; mul.f32 %f4935, %f3756, %f3753; mul.f32 %f4936, %f3757, %f3753; mul.f32 %f4937, %f3758, %f3753; ld.global.f32 %f608, [%rd81+16]; setp.eq.f32 %p562, %f608, 0f00000000; @%p562 bra $L__BB0_448; add.f32 %f3763, %f4866, %f4866; add.f32 %f3764, %f4868, %f4867; add.f32 %f3765, %f4869, %f4869; mul.f32 %f3766, %f3764, 0f3F000000; mul.f32 %f3767, %f3765, 0f3F000000; mul.f32 %f3768, %f3763, 0f3F000000; add.f32 %f3769, %f3768, 0f00000000; add.f32 %f3770, %f3767, %f3769; mul.f32 %f3771, %f3770, 0f3F000000; st.local.v4.f32 [%rd1], {%f3768, %f3766, %f3766, %f3767}; sub.f32 %f3772, %f3768, %f3771; st.local.f32 [%rd1], %f3772; sub.f32 %f3773, %f3767, %f3771; st.local.f32 [%rd1+12], %f3773; ld.local.v4.f32 {%f3774, %f3775, %f3776, %f3777}, [%rd1]; add.f32 %f3778, %f608, %f608; mul.f32 %f3779, %f4865, %f3778; fma.rn.f32 %f4934, %f3779, %f3774, %f4934; fma.rn.f32 %f4935, %f3779, %f3775, %f4935; fma.rn.f32 %f4936, %f3779, %f3776, %f4936; fma.rn.f32 %f4937, %f3779, %f3777, %f4937; bra.uni $L__BB0_448; $L__BB0_406: ld.global.u64 %rd801, [%rd81+24]; shl.b64 %rd802, %rd70, 4; add.s64 %rd803, %rd801, %rd802; ld.f32 %f3787, [%rd803+8]; mul.f32 %f3788, %f588, 0f3F7FBE77; fma.rn.f32 %f613, %f3788, %f588, 0f3A83126F; mul.f32 
%f3789, %f4875, %f4944; sub.f32 %f614, %f3789, %f379; ld.global.f32 %f3790, [%rd81+16]; mul.f32 %f3791, %f3790, 0f3F2AAAAB; ld.global.f32 %f3792, [%rd81+12]; mul.f32 %f3793, %f3787, %f3792; fma.rn.f32 %f615, %f3787, %f3791, %f3793; mul.f32 %f3794, %f4863, %f4863; fma.rn.f32 %f616, %f4875, %f4875, %f3794; mul.f32 %f3795, %f4863, %f4944; fma.rn.f32 %f617, %f4875, %f4864, %f3795; mul.f32 %f3796, %f4944, %f4944; fma.rn.f32 %f618, %f4864, %f4864, %f3796; mul.f32 %f619, %f3787, %f3790; mov.f32 %f3797, 0fBF000000; cvt.rzi.f32.f32 %f3798, %f3797; add.f32 %f3799, %f3798, %f3798; mov.f32 %f3800, 0fBF800000; sub.f32 %f3801, %f3800, %f3799; abs.f32 %f620, %f3801; abs.f32 %f621, %f614; setp.lt.f32 %p563, %f621, 0f00800000; mul.f32 %f3802, %f621, 0f4B800000; selp.f32 %f3803, %f3802, %f621, %p563; selp.f32 %f3804, 0fC1C00000, 0f00000000, %p563; mov.b32 %r1351, %f3803; add.s32 %r1352, %r1351, -1060439283; and.b32 %r1353, %r1352, -8388608; sub.s32 %r1354, %r1351, %r1353; mov.b32 %f3805, %r1354; cvt.rn.f32.s32 %f3806, %r1353; mov.f32 %f3807, 0f34000000; fma.rn.f32 %f3808, %f3806, %f3807, %f3804; add.f32 %f3809, %f3805, 0fBF800000; add.f32 %f3785, %f3805, 0f3F800000; mov.f32 %f4912, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f3784,%f3785; // end inline asm add.f32 %f3810, %f3809, %f3809; mul.f32 %f3811, %f3784, %f3810; mul.f32 %f3812, %f3811, %f3811; sub.f32 %f3813, %f3809, %f3811; add.f32 %f3814, %f3813, %f3813; neg.f32 %f3815, %f3811; fma.rn.f32 %f3816, %f3815, %f3809, %f3814; mul.rn.f32 %f3817, %f3784, %f3816; mov.f32 %f3818, 0f3B52E7DB; mov.f32 %f3819, 0f3A2C32E4; fma.rn.f32 %f3820, %f3819, %f3812, %f3818; mov.f32 %f3821, 0f3C93BB73; fma.rn.f32 %f3822, %f3820, %f3812, %f3821; mov.f32 %f3823, 0f3DF6384F; fma.rn.f32 %f3824, %f3822, %f3812, %f3823; mul.rn.f32 %f3825, %f3824, %f3812; mov.f32 %f3826, 0f3FB8AA3B; fma.rn.f32 %f3827, %f3811, %f3826, %f3808; sub.f32 %f3828, %f3808, %f3827; fma.rn.f32 %f3829, %f3811, %f3826, %f3828; fma.rn.f32 %f3830, %f3817, %f3826, %f3829; 
mov.f32 %f3831, 0f32A55E34; fma.rn.f32 %f3832, %f3811, %f3831, %f3830; mul.f32 %f3833, %f3825, 0f40400000; fma.rn.f32 %f3834, %f3833, %f3817, %f3832; fma.rn.f32 %f3835, %f3825, %f3811, %f3834; add.rn.f32 %f3836, %f3827, %f3835; neg.f32 %f3837, %f3827; add.rn.f32 %f3838, %f3836, %f3837; neg.f32 %f3839, %f3838; add.rn.f32 %f3840, %f3835, %f3839; mul.rn.f32 %f3841, %f3836, %f3800; neg.f32 %f3842, %f3841; fma.rn.f32 %f3843, %f3836, %f3800, %f3842; fma.rn.f32 %f3844, %f3840, %f3800, %f3843; cvt.rni.f32.f32 %f3845, %f3841; sub.f32 %f3846, %f3841, %f3845; add.f32 %f3847, %f3844, %f3846; mov.f32 %f3848, 0f3AAF85ED; mov.f32 %f3849, 0f391FCB8E; fma.rn.f32 %f3850, %f3849, %f3847, %f3848; mov.f32 %f3851, 0f3C1D9856; fma.rn.f32 %f3852, %f3850, %f3847, %f3851; mov.f32 %f3853, 0f3D6357BB; fma.rn.f32 %f3854, %f3852, %f3847, %f3853; mov.f32 %f3855, 0f3E75FDEC; fma.rn.f32 %f3856, %f3854, %f3847, %f3855; mov.f32 %f3857, 0f3F317218; fma.rn.f32 %f3858, %f3856, %f3847, %f3857; fma.rn.f32 %f3859, %f3858, %f3847, %f4912; cvt.rzi.s32.f32 %r1355, %f3845; setp.gt.f32 %p564, %f3845, 0f00000000; selp.b32 %r1356, 0, -2097152000, %p564; add.s32 %r1357, %r1356, 2130706432; mov.b32 %f3860, %r1357; mul.f32 %f3861, %f3859, %f3860; shl.b32 %r1358, %r1355, 23; sub.s32 %r1359, %r1358, %r1356; mov.b32 %f3862, %r1359; mul.f32 %f3863, %f3861, %f3862; abs.f32 %f3864, %f3841; setp.gt.f32 %p565, %f3864, 0f43180000; setp.lt.f32 %p566, %f3841, 0f00000000; selp.f32 %f3865, 0f00000000, 0f7F800000, %p566; selp.f32 %f622, %f3865, %f3863, %p565; setp.eq.f32 %p567, %f614, 0f3F800000; @%p567 bra $L__BB0_413; setp.gtu.f32 %p568, %f621, 0f7F800000; @%p568 bra $L__BB0_412; bra.uni $L__BB0_408; $L__BB0_412: mov.f32 %f3868, 0fBF800000; add.rn.f32 %f4912, %f614, %f3868; bra.uni $L__BB0_413; $L__BB0_416: ld.global.u64 %rd805, [%rd81+24]; shl.b64 %rd806, %rd70, 4; add.s64 %rd807, %rd805, %rd806; ld.f32 %f658, [%rd807+8]; mul.f32 %f3891, %f4875, %f4944; sub.f32 %f659, %f3891, %f379; ld.local.v4.f32 {%f4875, %f3893, %f3894, 
%f3895}, [%rd77]; add.f32 %f3897, %f3895, %f4875; mul.f32 %f661, %f3897, 0f3F000000; sub.f32 %f3898, %f4875, %f3895; mul.f32 %f3899, %f3898, 0f3F000000; add.f32 %f3902, %f3893, %f3894; mul.f32 %f3903, %f3902, 0f3F000000; sub.f32 %f3904, %f3893, %f3894; mul.f32 %f662, %f3904, 0f3F000000; mul.f32 %f3905, %f662, %f662; fma.rn.f32 %f3906, %f661, %f661, %f3905; sqrt.rn.f32 %f3907, %f3906; mul.f32 %f3908, %f3903, %f3903; fma.rn.f32 %f3909, %f3899, %f3899, %f3908; sqrt.rn.f32 %f3910, %f3909; add.f32 %f663, %f3907, %f3910; sub.f32 %f664, %f3907, %f3910; abs.f32 %f665, %f3899; abs.f32 %f666, %f3903; setp.eq.f32 %p576, %f665, 0f00000000; setp.eq.f32 %p577, %f666, 0f00000000; and.pred %p578, %p576, %p577; mov.b32 %r254, %f3899; mov.b32 %r1365, %f3903; and.b32 %r255, %r1365, -2147483648; @%p578 bra $L__BB0_420; bra.uni $L__BB0_417; $L__BB0_420: shr.s32 %r1370, %r254, 31; and.b32 %r1371, %r1370, 1078530011; or.b32 %r1372, %r1371, %r255; mov.b32 %f4921, %r1372; bra.uni $L__BB0_421; $L__BB0_417: setp.eq.f32 %p579, %f665, 0f7F800000; setp.eq.f32 %p580, %f666, 0f7F800000; and.pred %p581, %p579, %p580; @%p581 bra $L__BB0_419; bra.uni $L__BB0_418; $L__BB0_419: setp.lt.s32 %p585, %r254, 0; selp.b32 %r1368, 1075235812, 1061752795, %p585; or.b32 %r1369, %r1368, %r255; mov.b32 %f4921, %r1369; bra.uni $L__BB0_421; $L__BB0_418: setp.lt.s32 %p582, %r254, 0; min.f32 %f3911, %f666, %f665; max.f32 %f3912, %f666, %f665; div.rn.f32 %f3913, %f3911, %f3912; mul.rn.f32 %f3914, %f3913, %f3913; mov.f32 %f3915, 0fC0B59883; mov.f32 %f3916, 0fBF52C7EA; fma.rn.f32 %f3917, %f3914, %f3916, %f3915; mov.f32 %f3918, 0fC0D21907; fma.rn.f32 %f3919, %f3917, %f3914, %f3918; mul.f32 %f3920, %f3914, %f3919; mul.f32 %f3921, %f3913, %f3920; add.f32 %f3922, %f3914, 0f41355DC0; mov.f32 %f3923, 0f41E6BD60; fma.rn.f32 %f3924, %f3922, %f3914, %f3923; mov.f32 %f3925, 0f419D92C8; fma.rn.f32 %f3926, %f3924, %f3914, %f3925; rcp.rn.f32 %f3927, %f3926; fma.rn.f32 %f3928, %f3921, %f3927, %f3913; mov.f32 %f3929, 0f3FC90FDB; 
sub.f32 %f3930, %f3929, %f3928; setp.gt.f32 %p583, %f666, %f665; selp.f32 %f3931, %f3930, %f3928, %p583; mov.f32 %f3932, 0f40490FDB; sub.f32 %f3933, %f3932, %f3931; selp.f32 %f3934, %f3933, %f3931, %p582; mov.b32 %r1366, %f3934; or.b32 %r1367, %r255, %r1366; mov.b32 %f3935, %r1367; add.f32 %f3936, %f665, %f666; setp.le.f32 %p584, %f3936, 0f7F800000; selp.f32 %f4921, %f3935, %f3936, %p584; $L__BB0_421: abs.f32 %f671, %f661; setp.eq.f32 %p586, %f671, 0f00000000; abs.f32 %f672, %f662; setp.eq.f32 %p587, %f672, 0f00000000; and.pred %p588, %p586, %p587; mov.b32 %r256, %f661; mov.b32 %r1373, %f662; and.b32 %r257, %r1373, -2147483648; @%p588 bra $L__BB0_425; bra.uni $L__BB0_422; $L__BB0_425: shr.s32 %r1378, %r256, 31; and.b32 %r1379, %r1378, 1078530011; or.b32 %r1380, %r1379, %r257; mov.b32 %f4922, %r1380; bra.uni $L__BB0_426; $L__BB0_422: setp.eq.f32 %p589, %f671, 0f7F800000; setp.eq.f32 %p590, %f672, 0f7F800000; and.pred %p591, %p589, %p590; @%p591 bra $L__BB0_424; bra.uni $L__BB0_423; $L__BB0_424: setp.lt.s32 %p595, %r256, 0; selp.b32 %r1376, 1075235812, 1061752795, %p595; or.b32 %r1377, %r1376, %r257; mov.b32 %f4922, %r1377; bra.uni $L__BB0_426; $L__BB0_423: setp.lt.s32 %p592, %r256, 0; min.f32 %f3937, %f672, %f671; max.f32 %f3938, %f672, %f671; div.rn.f32 %f3939, %f3937, %f3938; mul.rn.f32 %f3940, %f3939, %f3939; mov.f32 %f3941, 0fC0B59883; mov.f32 %f3942, 0fBF52C7EA; fma.rn.f32 %f3943, %f3940, %f3942, %f3941; mov.f32 %f3944, 0fC0D21907; fma.rn.f32 %f3945, %f3943, %f3940, %f3944; mul.f32 %f3946, %f3940, %f3945; mul.f32 %f3947, %f3939, %f3946; add.f32 %f3948, %f3940, 0f41355DC0; mov.f32 %f3949, 0f41E6BD60; fma.rn.f32 %f3950, %f3948, %f3940, %f3949; mov.f32 %f3951, 0f419D92C8; fma.rn.f32 %f3952, %f3950, %f3940, %f3951; rcp.rn.f32 %f3953, %f3952; fma.rn.f32 %f3954, %f3947, %f3953, %f3939; mov.f32 %f3955, 0f3FC90FDB; sub.f32 %f3956, %f3955, %f3954; setp.gt.f32 %p593, %f672, %f671; selp.f32 %f3957, %f3956, %f3954, %p593; mov.f32 %f3958, 0f40490FDB; sub.f32 %f3959, %f3958, 
%f3957; selp.f32 %f3960, %f3959, %f3957, %p592; mov.b32 %r1374, %f3960; or.b32 %r1375, %r257, %r1374; mov.b32 %f3961, %r1375; add.f32 %f3962, %f671, %f672; setp.le.f32 %p594, %f3962, 0f7F800000; selp.f32 %f4922, %f3961, %f3962, %p594; $L__BB0_426: sub.f32 %f3963, %f4922, %f4921; mul.f32 %f677, %f3963, 0f3F000000; add.f32 %f3964, %f4921, %f4922; mul.f32 %f678, %f3964, 0f3F000000; mul.f32 %f3965, %f677, 0f3F22F983; cvt.rni.s32.f32 %r1699, %f3965; cvt.rn.f32.s32 %f3966, %r1699; mov.f32 %f3967, 0fBFC90FDA; fma.rn.f32 %f3968, %f3966, %f3967, %f677; mov.f32 %f3969, 0fB3A22168; fma.rn.f32 %f3970, %f3966, %f3969, %f3968; mov.f32 %f3971, 0fA7C234C5; fma.rn.f32 %f4923, %f3966, %f3971, %f3970; abs.f32 %f680, %f677; setp.ltu.f32 %p596, %f680, 0f47CE4780; @%p596 bra $L__BB0_434; setp.eq.f32 %p597, %f680, 0f7F800000; @%p597 bra $L__BB0_433; bra.uni $L__BB0_428; $L__BB0_433: mov.f32 %f3974, 0f00000000; mul.rn.f32 %f4923, %f677, %f3974; mov.u32 %r1699, 0; bra.uni $L__BB0_434; $L__BB0_408: setp.eq.f32 %p569, %f614, 0f00000000; setp.eq.f32 %p570, %f621, 0f7F800000; or.pred %p571, %p569, %p570; @%p571 bra $L__BB0_411; bra.uni $L__BB0_409; $L__BB0_411: setp.eq.f32 %p574, %f620, 0f3F800000; add.f32 %f3867, %f614, %f614; mov.b32 %r1360, %f3867; xor.b32 %r1361, %r1360, 2139095040; and.b32 %r1362, %r1361, 2147483647; selp.b32 %r1363, %r1361, %r1362, %p574; mov.b32 %f4912, %r1363; bra.uni $L__BB0_413; $L__BB0_428: mov.b32 %r259, %f677; shr.u32 %r1383, %r259, 23; and.b32 %r1384, %r1383, 255; add.s32 %r260, %r1384, -128; shl.b32 %r1385, %r259, 8; or.b32 %r261, %r1385, -2147483648; shr.u32 %r262, %r260, 5; mov.u32 %r1695, 0; mov.u64 %rd1191, __cudart_i2opi_f; mov.u64 %rd1192, %rd5; mov.u32 %r1696, %r1695; $L__BB0_429: .pragma "nounroll"; mov.u32 %r264, %r1696; ld.global.nc.u32 %r1388, [%rd1191]; // begin inline asm { mad.lo.cc.u32 %r1386, %r1388, %r261, %r264; madc.hi.u32 %r1696, %r1388, %r261, 0; } // end inline asm st.local.u32 [%rd1192], %r1386; add.s64 %rd1192, %rd1192, 4; add.s64 
%rd1191, %rd1191, 4; add.s32 %r1695, %r1695, 1; setp.ne.s32 %p598, %r1695, 6; @%p598 bra $L__BB0_429; add.s64 %rd1100, %rd5, 24; mov.u32 %r1393, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1391, %r1393, %r261, %r264; madc.hi.u32 %r1392, %r1393, %r261, 0; } // end inline asm st.local.u32 [%rd1100], %r1392; mov.u32 %r1396, 4; sub.s32 %r267, %r1396, %r262; mov.u32 %r1397, 6; sub.s32 %r1398, %r1397, %r262; mul.wide.s32 %rd809, %r1398, 4; add.s64 %rd810, %rd5, %rd809; ld.local.u32 %r1697, [%rd810]; ld.local.u32 %r1698, [%rd810+-4]; and.b32 %r270, %r260, 31; setp.eq.s32 %p599, %r270, 0; @%p599 bra $L__BB0_432; mov.u32 %r1399, 32; sub.s32 %r1400, %r1399, %r270; shr.u32 %r1401, %r1698, %r1400; shl.b32 %r1402, %r1697, %r270; add.s32 %r1697, %r1401, %r1402; mul.wide.s32 %rd811, %r267, 4; add.s64 %rd812, %rd5, %rd811; ld.local.u32 %r1403, [%rd812]; shr.u32 %r1404, %r1403, %r1400; shl.b32 %r1405, %r1698, %r270; add.s32 %r1698, %r1404, %r1405; $L__BB0_432: and.b32 %r1406, %r259, -2147483648; shr.u32 %r1407, %r1698, 30; shl.b32 %r1408, %r1697, 2; or.b32 %r1409, %r1407, %r1408; shr.u32 %r1410, %r1409, 31; shr.u32 %r1411, %r1697, 30; add.s32 %r1412, %r1410, %r1411; neg.s32 %r1413, %r1412; setp.eq.s32 %p600, %r1406, 0; selp.b32 %r1699, %r1412, %r1413, %p600; setp.ne.s32 %p601, %r1410, 0; xor.b32 %r1414, %r1406, -2147483648; selp.b32 %r1415, %r1414, %r1406, %p601; selp.b32 %r1416, -1, 0, %p601; xor.b32 %r1417, %r1409, %r1416; shl.b32 %r1418, %r1698, 2; xor.b32 %r1419, %r1418, %r1416; cvt.u64.u32 %rd813, %r1417; cvt.u64.u32 %rd814, %r1419; bfi.b64 %rd815, %rd813, %rd814, 32, 32; cvt.rn.f64.s64 %fd23, %rd815; mul.f64 %fd24, %fd23, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f3972, %fd24; setp.eq.s32 %p602, %r1415, 0; neg.f32 %f3973, %f3972; selp.f32 %f4923, %f3972, %f3973, %p602; $L__BB0_434: mul.f32 %f3975, %f678, 0f3F22F983; cvt.rni.s32.f32 %r1704, %f3975; cvt.rn.f32.s32 %f3976, %r1704; fma.rn.f32 %f3978, %f3976, %f3967, %f678; fma.rn.f32 %f3980, %f3976, %f3969, %f3978; fma.rn.f32 
%f4924, %f3976, %f3971, %f3980; abs.f32 %f685, %f678; setp.ltu.f32 %p603, %f685, 0f47CE4780; @%p603 bra $L__BB0_442; setp.eq.f32 %p604, %f685, 0f7F800000; @%p604 bra $L__BB0_441; bra.uni $L__BB0_436; $L__BB0_441: mov.f32 %f3984, 0f00000000; mul.rn.f32 %f4924, %f678, %f3984; mov.u32 %r1704, 0; bra.uni $L__BB0_442; $L__BB0_436: mov.b32 %r278, %f678; shr.u32 %r1423, %r278, 23; and.b32 %r1424, %r1423, 255; add.s32 %r279, %r1424, -128; shl.b32 %r1425, %r278, 8; or.b32 %r280, %r1425, -2147483648; shr.u32 %r281, %r279, 5; mov.u32 %r1700, 0; mov.u64 %rd1193, __cudart_i2opi_f; mov.u64 %rd1194, %rd5; mov.u32 %r1701, %r1700; $L__BB0_437: .pragma "nounroll"; mov.u32 %r283, %r1701; ld.global.nc.u32 %r1428, [%rd1193]; // begin inline asm { mad.lo.cc.u32 %r1426, %r1428, %r280, %r283; madc.hi.u32 %r1701, %r1428, %r280, 0; } // end inline asm st.local.u32 [%rd1194], %r1426; add.s64 %rd1194, %rd1194, 4; add.s64 %rd1193, %rd1193, 4; add.s32 %r1700, %r1700, 1; setp.ne.s32 %p605, %r1700, 6; @%p605 bra $L__BB0_437; add.s64 %rd1101, %rd5, 24; mov.u32 %r1433, -1560706194; // begin inline asm { mad.lo.cc.u32 %r1431, %r1433, %r280, %r283; madc.hi.u32 %r1432, %r1433, %r280, 0; } // end inline asm st.local.u32 [%rd1101], %r1432; mov.u32 %r1436, 4; sub.s32 %r286, %r1436, %r281; mov.u32 %r1437, 6; sub.s32 %r1438, %r1437, %r281; mul.wide.s32 %rd817, %r1438, 4; add.s64 %rd818, %rd5, %rd817; ld.local.u32 %r1702, [%rd818]; ld.local.u32 %r1703, [%rd818+-4]; and.b32 %r289, %r279, 31; setp.eq.s32 %p606, %r289, 0; @%p606 bra $L__BB0_440; mov.u32 %r1439, 32; sub.s32 %r1440, %r1439, %r289; shr.u32 %r1441, %r1703, %r1440; shl.b32 %r1442, %r1702, %r289; add.s32 %r1702, %r1441, %r1442; mul.wide.s32 %rd819, %r286, 4; add.s64 %rd820, %rd5, %rd819; ld.local.u32 %r1443, [%rd820]; shr.u32 %r1444, %r1443, %r1440; shl.b32 %r1445, %r1703, %r289; add.s32 %r1703, %r1444, %r1445; $L__BB0_440: and.b32 %r1446, %r278, -2147483648; shr.u32 %r1447, %r1703, 30; shl.b32 %r1448, %r1702, 2; or.b32 %r1449, %r1447, %r1448; 
shr.u32 %r1450, %r1449, 31; shr.u32 %r1451, %r1702, 30; add.s32 %r1452, %r1450, %r1451; neg.s32 %r1453, %r1452; setp.eq.s32 %p607, %r1446, 0; selp.b32 %r1704, %r1452, %r1453, %p607; setp.ne.s32 %p608, %r1450, 0; xor.b32 %r1454, %r1446, -2147483648; selp.b32 %r1455, %r1454, %r1446, %p608; selp.b32 %r1456, -1, 0, %p608; xor.b32 %r1457, %r1449, %r1456; shl.b32 %r1458, %r1703, 2; xor.b32 %r1459, %r1458, %r1456; cvt.u64.u32 %rd821, %r1457; cvt.u64.u32 %rd822, %r1459; bfi.b64 %rd823, %rd821, %rd822, 32, 32; cvt.rn.f64.s64 %fd25, %rd823; mul.f64 %fd26, %fd25, 0d3BF921FB54442D19; cvt.rn.f32.f64 %f3982, %fd26; setp.eq.s32 %p609, %r1455, 0; neg.f32 %f3983, %f3982; selp.f32 %f4924, %f3982, %f3983, %p609; $L__BB0_442: setp.lt.f32 %p610, %f664, 0f00000000; mov.f32 %f3985, 0f00000000; selp.f32 %f3986, 0fBF800000, 0f3F800000, %p610; mov.f32 %f3987, 0f3F800000; mul.f32 %f3988, %f4923, %f4923; mov.f32 %f3989, 0fBAB607ED; mov.f32 %f3990, 0f37CBAC00; fma.rn.f32 %f3991, %f3990, %f3988, %f3989; mov.f32 %f3992, 0f3D2AAABB; fma.rn.f32 %f3993, %f3991, %f3988, %f3992; mov.f32 %f3994, 0fBEFFFFFF; fma.rn.f32 %f3995, %f3993, %f3988, %f3994; fma.rn.f32 %f3996, %f3995, %f3988, %f3987; mov.f32 %f3997, 0f3C0885E4; mov.f32 %f3998, 0fB94D4153; fma.rn.f32 %f3999, %f3998, %f3988, %f3997; mov.f32 %f4000, 0fBE2AAAA8; fma.rn.f32 %f4001, %f3999, %f3988, %f4000; fma.rn.f32 %f4002, %f3988, %f4923, %f3985; fma.rn.f32 %f4003, %f4001, %f4002, %f4923; and.b32 %r1461, %r1699, 1; setp.eq.b32 %p611, %r1461, 1; selp.f32 %f4004, %f3996, %f4003, %p611; selp.f32 %f4005, %f4003, %f3996, %p611; neg.f32 %f4006, %f4004; and.b32 %r1462, %r1699, 2; setp.eq.s32 %p612, %r1462, 0; selp.f32 %f4007, %f4004, %f4006, %p612; neg.f32 %f4008, %f4005; add.s32 %r1463, %r1699, 1; and.b32 %r1464, %r1463, 2; setp.eq.s32 %p613, %r1464, 0; selp.f32 %f4009, %f4005, %f4008, %p613; mul.f32 %f4010, %f4924, %f4924; fma.rn.f32 %f4011, %f3990, %f4010, %f3989; fma.rn.f32 %f4012, %f4011, %f4010, %f3992; fma.rn.f32 %f4013, %f4012, %f4010, %f3994; 
fma.rn.f32 %f4014, %f4013, %f4010, %f3987; fma.rn.f32 %f4015, %f4010, %f4924, %f3985; fma.rn.f32 %f4016, %f3998, %f4010, %f3997; fma.rn.f32 %f4017, %f4016, %f4010, %f4000; fma.rn.f32 %f4018, %f4017, %f4015, %f4924; and.b32 %r1465, %r1704, 1; setp.eq.b32 %p614, %r1465, 1; selp.f32 %f4019, %f4014, %f4018, %p614; selp.f32 %f4020, %f4018, %f4014, %p614; and.b32 %r1466, %r1704, 2; setp.eq.s32 %p615, %r1466, 0; neg.f32 %f4021, %f4019; selp.f32 %f4022, %f4019, %f4021, %p615; add.s32 %r1467, %r1704, 1; and.b32 %r1468, %r1467, 2; setp.eq.s32 %p616, %r1468, 0; neg.f32 %f4023, %f4020; selp.f32 %f4024, %f4020, %f4023, %p616; mov.b32 %r1469, %f4024; neg.f32 %f4025, %f4022; mov.b32 %r1470, %f4022; cvt.u64.u32 %rd824, %r1470; mov.b32 %r1471, %f4025; cvt.u64.u32 %rd825, %r1471; cvt.u64.u32 %rd826, %r1469; bfi.b64 %rd827, %rd826, %rd825, 32, 32; mov.b64 {%r1472, %r1473}, %rd827; bfi.b64 %rd828, %rd824, %rd826, 32, 32; mov.b64 {%r1474, %r1475}, %rd828; mul.f32 %f4026, %f3986, %f4007; mov.b32 %r1476, %f4026; cvt.u64.u32 %rd829, %r1476; mov.b32 %r1477, %f4009; cvt.u64.u32 %rd830, %r1477; neg.f32 %f4027, %f4007; mov.b32 %r1478, %f4027; mul.f32 %f4028, %f3986, %f4009; mov.b32 %r1479, %f4028; cvt.u64.u32 %rd831, %r1479; cvt.u64.u32 %rd832, %r1478; bfi.b64 %rd833, %rd831, %rd832, 32, 32; mov.b64 {%r1480, %r1481}, %rd833; bfi.b64 %rd834, %rd829, %rd830, 32, 32; mov.b64 {%r1482, %r1483}, %rd834; add.f32 %f689, %f663, 0fBF800000; fma.rn.f32 %f690, %f664, %f3986, 0fBF800000; mov.b32 %f691, %r1474; mov.b32 %f692, %r1475; mov.b32 %f693, %r1472; mov.b32 %f694, %r1473; mov.b32 %f695, %r1482; mov.b32 %f696, %r1483; mov.b32 %f697, %r1480; mov.b32 %f698, %r1481; add.f32 %f699, %f659, 0fBF800000; setp.eq.f32 %p617, %f588, 0f3F800000; @%p617 bra $L__BB0_447; bra.uni $L__BB0_443; $L__BB0_447: ld.global.f32 %f4090, [%rd81+20]; add.f32 %f4091, %f4090, %f4090; mul.f32 %f4092, %f658, %f4091; mul.f32 %f4093, %f689, %f691; mul.f32 %f4094, %f689, %f692; mul.f32 %f4095, %f690, %f693; mul.f32 %f4096, %f696, 
%f4095; fma.rn.f32 %f4097, %f695, %f4093, %f4096; mul.f32 %f4098, %f690, %f694; mul.f32 %f4099, %f696, %f4098; fma.rn.f32 %f4100, %f695, %f4094, %f4099; mul.f32 %f4101, %f698, %f4095; fma.rn.f32 %f4102, %f697, %f4093, %f4101; mul.f32 %f4103, %f698, %f4098; fma.rn.f32 %f4104, %f697, %f4094, %f4103; mul.f32 %f4105, %f4097, %f4092; mul.f32 %f4106, %f4100, %f4092; mul.f32 %f4107, %f4102, %f4092; mul.f32 %f4108, %f4104, %f4092; mul.f32 %f4109, %f4863, %f4107; fma.rn.f32 %f4110, %f4875, %f4105, %f4109; mul.f32 %f4111, %f4863, %f4108; fma.rn.f32 %f4112, %f4875, %f4106, %f4111; mul.f32 %f4113, %f4107, %f4944; fma.rn.f32 %f4114, %f4105, %f4864, %f4113; mul.f32 %f4115, %f4108, %f4944; fma.rn.f32 %f4116, %f4106, %f4864, %f4115; ld.global.f32 %f4117, [%rd81+16]; mul.f32 %f4118, %f658, %f4117; mul.f32 %f4119, %f699, %f4118; mul.f32 %f4120, %f659, %f4119; mov.u64 %rd836, 0; st.local.v2.u64 [%rd1], {%rd836, %rd836}; mov.u32 %r1485, 1065353216; st.local.u32 [%rd1], %r1485; st.local.u32 [%rd1+12], %r1485; ld.local.v4.f32 {%f4121, %f4122, %f4123, %f4124}, [%rd1]; fma.rn.f32 %f4937, %f4120, %f4124, %f4116; fma.rn.f32 %f4936, %f4120, %f4123, %f4114; fma.rn.f32 %f4935, %f4120, %f4122, %f4112; fma.rn.f32 %f4934, %f4120, %f4121, %f4110; st.local.v4.f32 [%rd89], {%f4934, %f4935, %f4936, %f4937}; bra.uni $L__BB0_448; $L__BB0_443: ld.global.f32 %f4029, [%rd81+20]; add.f32 %f4030, %f4029, %f4029; mul.f32 %f4031, %f658, %f4030; max.f32 %f4033, %f689, %f3985; mul.f32 %f4034, %f691, %f4033; mul.f32 %f4035, %f692, %f4033; max.f32 %f4036, %f690, %f3985; mul.f32 %f4037, %f693, %f4036; mul.f32 %f4038, %f694, %f4036; mul.f32 %f4039, %f696, %f4037; fma.rn.f32 %f4040, %f695, %f4034, %f4039; mul.f32 %f4041, %f696, %f4038; fma.rn.f32 %f4042, %f695, %f4035, %f4041; mul.f32 %f4043, %f698, %f4037; fma.rn.f32 %f4044, %f697, %f4034, %f4043; mul.f32 %f4045, %f698, %f4038; fma.rn.f32 %f4046, %f697, %f4035, %f4045; mul.f32 %f4047, %f4040, %f4031; mul.f32 %f4048, %f4042, %f4031; mul.f32 %f4049, %f4044, %f4031; 
mul.f32 %f4050, %f4046, %f4031; mul.f32 %f4051, %f4863, %f4049; fma.rn.f32 %f4925, %f4875, %f4047, %f4051; mul.f32 %f4052, %f4863, %f4050; fma.rn.f32 %f4926, %f4875, %f4048, %f4052; mul.f32 %f4053, %f4049, %f4944; fma.rn.f32 %f4927, %f4047, %f4864, %f4053; mul.f32 %f4054, %f4050, %f4944; fma.rn.f32 %f4928, %f4048, %f4864, %f4054; min.f32 %f4055, %f689, %f3985; mul.f32 %f4056, %f691, %f4055; mul.f32 %f4057, %f692, %f4055; min.f32 %f4058, %f690, %f3985; mul.f32 %f4059, %f693, %f4058; mul.f32 %f4060, %f694, %f4058; mul.f32 %f4061, %f696, %f4059; fma.rn.f32 %f4062, %f695, %f4056, %f4061; mul.f32 %f4063, %f696, %f4060; fma.rn.f32 %f4064, %f695, %f4057, %f4063; mul.f32 %f4065, %f698, %f4059; fma.rn.f32 %f4066, %f697, %f4056, %f4065; mul.f32 %f4067, %f698, %f4060; fma.rn.f32 %f4068, %f697, %f4057, %f4067; mul.f32 %f4069, %f4031, %f4062; mul.f32 %f4070, %f4031, %f4064; mul.f32 %f4071, %f4031, %f4066; mul.f32 %f4072, %f4031, %f4068; mul.f32 %f4073, %f4863, %f4071; fma.rn.f32 %f4929, %f4875, %f4069, %f4073; mul.f32 %f4074, %f4863, %f4072; fma.rn.f32 %f4930, %f4875, %f4070, %f4074; mul.f32 %f4075, %f4071, %f4944; fma.rn.f32 %f4931, %f4069, %f4864, %f4075; mul.f32 %f4076, %f4072, %f4944; fma.rn.f32 %f4932, %f4070, %f4864, %f4076; ld.global.f32 %f4077, [%rd81+16]; mul.f32 %f4078, %f658, %f4077; mul.f32 %f4079, %f699, %f4078; mul.f32 %f4080, %f659, %f4079; st.local.v4.f32 [%rd1], {%f3985, %f3985, %f3985, %f3985}; mov.u64 %rd835, 0; st.local.v2.u64 [%rd89], {%rd835, %rd835}; mov.u32 %r1484, 1065353216; st.local.u32 [%rd89], %r1484; st.local.u32 [%rd89+12], %r1484; ld.local.v4.f32 {%f4081, %f4082, %f4083, %f4084}, [%rd89]; mul.f32 %f708, %f4080, %f4081; mul.f32 %f709, %f4080, %f4082; mul.f32 %f710, %f4080, %f4083; mul.f32 %f711, %f4080, %f4084; setp.lt.f32 %p618, %f659, 0f3F800000; @%p618 bra $L__BB0_445; bra.uni $L__BB0_444; $L__BB0_445: add.f32 %f4929, %f4929, %f708; add.f32 %f4930, %f4930, %f709; add.f32 %f4931, %f4931, %f710; add.f32 %f4932, %f4932, %f711; bra.uni $L__BB0_446; 
$L__BB0_444: add.f32 %f4925, %f4925, %f708; add.f32 %f4926, %f4926, %f709; add.f32 %f4927, %f4927, %f710; add.f32 %f4928, %f4928, %f711; $L__BB0_446: ld.global.u8 %rs34, [%rd81+8]; setp.ne.s16 %p619, %rs34, 0; setp.eq.f32 %p620, %f588, 0f00000000; and.pred %p621, %p620, %p619; selp.f32 %f4089, 0f00000000, 0f3F800000, %p621; fma.rn.f32 %f4934, %f4925, %f4089, %f4929; fma.rn.f32 %f4935, %f4926, %f4089, %f4930; fma.rn.f32 %f4936, %f4927, %f4089, %f4931; fma.rn.f32 %f4937, %f4928, %f4089, %f4932; bra.uni $L__BB0_448; $L__BB0_398: setp.eq.f32 %p551, %f590, 0f00000000; setp.eq.f32 %p552, %f593, 0f7F800000; or.pred %p553, %p551, %p552; @%p553 bra $L__BB0_402; bra.uni $L__BB0_399; $L__BB0_402: setp.eq.f32 %p560, %f592, 0f3F800000; add.f32 %f3746, %f590, %f590; mov.b32 %r1345, %f3746; xor.b32 %r1346, %r1345, 2139095040; setp.lt.s32 %p561, %r253, 0; selp.b32 %r1347, %r1346, %r1345, %p561; and.b32 %r1348, %r1347, 2147483647; selp.b32 %r1349, %r1347, %r1348, %p560; mov.b32 %f4911, %r1349; bra.uni $L__BB0_404; $L__BB0_409: setp.geu.f32 %p572, %f614, 0f00000000; mov.f32 %f4912, %f622; @%p572 bra $L__BB0_413; setp.eq.f32 %p573, %f620, 0f3F800000; neg.f32 %f3866, %f622; selp.f32 %f4912, %f3866, %f622, %p573; $L__BB0_413: add.f32 %f3869, %f616, 0f00000000; add.f32 %f3870, %f3869, %f618; mul.f32 %f3871, %f3870, 0f3F000000; sub.f32 %f3872, %f616, %f3871; sub.f32 %f3873, %f618, %f3871; mul.f32 %f3874, %f619, %f4912; mul.f32 %f4913, %f3872, %f3874; mul.f32 %f4914, %f617, %f3874; mul.f32 %f4916, %f3873, %f3874; fma.rn.f32 %f3875, %f614, %f614, 0fBF800000; mul.f32 %f3876, %f615, 0f3F000000; mul.f32 %f3877, %f3875, %f3876; mov.f32 %f3878, 0f00000000; st.local.v4.f32 [%rd1], {%f3878, %f3878, %f3878, %f3878}; mov.u64 %rd804, 0; st.local.v2.u64 [%rd89], {%rd804, %rd804}; mov.u32 %r1364, 1065353216; st.local.u32 [%rd89], %r1364; st.local.u32 [%rd89+12], %r1364; ld.local.v4.f32 {%f3879, %f3880, %f3881, %f3882}, [%rd89]; mul.f32 %f4917, %f3877, %f3879; mul.f32 %f4918, %f3877, %f3880; mul.f32 
%f4919, %f3877, %f3881; mul.f32 %f4920, %f3877, %f3882; setp.ltu.f32 %p575, %f614, 0f3F800000; mov.f32 %f4915, %f4914; @%p575 bra $L__BB0_415; add.f32 %f4913, %f4913, %f4917; add.f32 %f643, %f4914, %f4918; add.f32 %f4915, %f4914, %f4919; add.f32 %f4916, %f4916, %f4920; st.local.v4.f32 [%rd1], {%f3878, %f3878, %f3878, %f3878}; mov.f32 %f4914, %f643; mov.f32 %f4917, %f3878; mov.f32 %f4918, %f3878; mov.f32 %f4919, %f3878; mov.f32 %f4920, %f3878; $L__BB0_415: fma.rn.f32 %f4934, %f613, %f4913, %f4917; fma.rn.f32 %f4935, %f613, %f4914, %f4918; fma.rn.f32 %f4936, %f613, %f4915, %f4919; fma.rn.f32 %f4937, %f613, %f4916, %f4920; $L__BB0_448: ld.param.f32 %f4780, [g2p2g_param_11]; div.rn.f32 %f4132, %f79, %f4780; mov.b32 %r1486, %f4132; and.b32 %r1487, %r1486, -2147483648; or.b32 %r1488, %r1487, 1056964608; mov.b32 %f4133, %r1488; add.rz.f32 %f4134, %f4132, %f4133; cvt.rzi.f32.f32 %f749, %f4134; div.rn.f32 %f4135, %f80, %f4780; mov.b32 %r1489, %f4135; and.b32 %r1490, %r1489, -2147483648; or.b32 %r1491, %r1490, 1056964608; mov.b32 %f4136, %r1491; add.rz.f32 %f4137, %f4135, %f4136; cvt.rzi.f32.f32 %f750, %f4137; add.f32 %f4138, %f749, 0fBF800000; add.f32 %f4139, %f750, 0fBF800000; mul.f32 %f4140, %f4780, %f4138; mul.f32 %f4141, %f4780, %f4139; sub.f32 %f751, %f4140, %f79; sub.f32 %f752, %f4141, %f80; neg.f32 %f4142, %f751; div.rn.f32 %f753, %f4142, %f4780; mov.f32 %f4143, 0f3FC00000; sub.f32 %f754, %f4143, %f753; abs.f32 %f755, %f754; setp.lt.f32 %p622, %f755, 0f00800000; mul.f32 %f4144, %f755, 0f4B800000; selp.f32 %f4145, %f4144, %f755, %p622; selp.f32 %f4146, 0fC1C00000, 0f00000000, %p622; mov.b32 %r1492, %f4145; add.s32 %r1493, %r1492, -1060439283; and.b32 %r1494, %r1493, -8388608; sub.s32 %r1495, %r1492, %r1494; mov.b32 %f4147, %r1495; cvt.rn.f32.s32 %f4148, %r1494; mov.f32 %f4149, 0f34000000; fma.rn.f32 %f4150, %f4148, %f4149, %f4146; add.f32 %f4151, %f4147, 0fBF800000; add.f32 %f4130, %f4147, 0f3F800000; mov.f32 %f4939, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 
%f4129,%f4130; // end inline asm add.f32 %f4152, %f4151, %f4151; mov.f32 %f4153, 0f40000000; mul.f32 %f4154, %f4129, %f4152; mul.f32 %f4155, %f4154, %f4154; sub.f32 %f4156, %f4151, %f4154; add.f32 %f4157, %f4156, %f4156; neg.f32 %f4158, %f4154; fma.rn.f32 %f4159, %f4158, %f4151, %f4157; mul.rn.f32 %f4160, %f4129, %f4159; mov.f32 %f4161, 0f3B52E7DB; mov.f32 %f4162, 0f3A2C32E4; fma.rn.f32 %f4163, %f4162, %f4155, %f4161; mov.f32 %f4164, 0f3C93BB73; fma.rn.f32 %f4165, %f4163, %f4155, %f4164; mov.f32 %f4166, 0f3DF6384F; fma.rn.f32 %f4167, %f4165, %f4155, %f4166; mul.rn.f32 %f4168, %f4167, %f4155; mov.f32 %f4169, 0f3FB8AA3B; fma.rn.f32 %f4170, %f4154, %f4169, %f4150; sub.f32 %f4171, %f4150, %f4170; fma.rn.f32 %f4172, %f4154, %f4169, %f4171; fma.rn.f32 %f4173, %f4160, %f4169, %f4172; mov.f32 %f4174, 0f32A55E34; fma.rn.f32 %f4175, %f4154, %f4174, %f4173; mul.f32 %f4176, %f4168, 0f40400000; fma.rn.f32 %f4177, %f4176, %f4160, %f4175; fma.rn.f32 %f4178, %f4168, %f4154, %f4177; add.rn.f32 %f4179, %f4170, %f4178; neg.f32 %f4180, %f4170; add.rn.f32 %f4181, %f4179, %f4180; neg.f32 %f4182, %f4181; add.rn.f32 %f4183, %f4178, %f4182; mul.rn.f32 %f4184, %f4179, %f4153; neg.f32 %f4185, %f4184; fma.rn.f32 %f4186, %f4179, %f4153, %f4185; fma.rn.f32 %f4187, %f4183, %f4153, %f4186; cvt.rni.f32.f32 %f4188, %f4184; sub.f32 %f4189, %f4184, %f4188; add.f32 %f4190, %f4187, %f4189; mov.f32 %f4191, 0f3AAF85ED; mov.f32 %f4192, 0f391FCB8E; fma.rn.f32 %f4193, %f4192, %f4190, %f4191; mov.f32 %f4194, 0f3C1D9856; fma.rn.f32 %f4195, %f4193, %f4190, %f4194; mov.f32 %f4196, 0f3D6357BB; fma.rn.f32 %f4197, %f4195, %f4190, %f4196; mov.f32 %f4198, 0f3E75FDEC; fma.rn.f32 %f4199, %f4197, %f4190, %f4198; mov.f32 %f4200, 0f3F317218; fma.rn.f32 %f4201, %f4199, %f4190, %f4200; fma.rn.f32 %f4202, %f4201, %f4190, %f4939; cvt.rzi.s32.f32 %r1496, %f4188; setp.gt.f32 %p623, %f4188, 0f00000000; selp.b32 %r1497, 0, -2097152000, %p623; add.s32 %r1498, %r1497, 2130706432; mov.b32 %f4203, %r1498; mul.f32 %f4204, %f4202, 
%f4203; shl.b32 %r1499, %r1496, 23; sub.s32 %r1500, %r1499, %r1497; mov.b32 %f4205, %r1500; mul.f32 %f4206, %f4204, %f4205; abs.f32 %f4207, %f4184; setp.gt.f32 %p624, %f4207, 0f43180000; setp.lt.f32 %p625, %f4184, 0f00000000; selp.f32 %f4208, 0f00000000, 0f7F800000, %p625; selp.f32 %f756, %f4208, %f4206, %p624; setp.eq.f32 %p626, %f754, 0f3F800000; mov.f32 %f4938, %f4939; @%p626 bra $L__BB0_455; mov.f32 %f4807, 0f3FC00000; sub.f32 %f4806, %f4807, %f753; abs.f32 %f4805, %f4806; setp.gtu.f32 %p627, %f4805, 0f7F800000; @%p627 bra $L__BB0_454; bra.uni $L__BB0_450; $L__BB0_454: mov.f32 %f4211, 0f40000000; add.rn.f32 %f4938, %f754, %f4211; bra.uni $L__BB0_455; $L__BB0_450: mov.f32 %f4810, 0f3FC00000; sub.f32 %f4809, %f4810, %f753; abs.f32 %f4808, %f4809; setp.eq.f32 %p628, %f4809, 0f00000000; setp.eq.f32 %p629, %f4808, 0f7F800000; or.pred %p630, %p628, %p629; @%p630 bra $L__BB0_453; bra.uni $L__BB0_451; $L__BB0_453: setp.eq.f32 %p633, %f19, 0f3F800000; add.f32 %f4210, %f754, %f754; mov.b32 %r1501, %f4210; and.b32 %r1502, %r1501, 2147483647; selp.b32 %r1503, %r1501, %r1502, %p633; mov.b32 %f4938, %r1503; bra.uni $L__BB0_455; $L__BB0_451: setp.geu.f32 %p631, %f754, 0f00000000; mov.f32 %f4938, %f756; @%p631 bra $L__BB0_455; setp.eq.f32 %p632, %f19, 0f3F800000; neg.f32 %f4209, %f756; selp.f32 %f4938, %f4209, %f756, %p632; $L__BB0_455: mul.f32 %f761, %f4938, 0f3F000000; add.f32 %f762, %f753, 0fBF800000; abs.f32 %f763, %f762; setp.lt.f32 %p634, %f763, 0f00800000; mul.f32 %f4215, %f763, 0f4B800000; selp.f32 %f4216, %f4215, %f763, %p634; selp.f32 %f4217, 0fC1C00000, 0f00000000, %p634; mov.b32 %r1504, %f4216; add.s32 %r1505, %r1504, -1060439283; and.b32 %r1506, %r1505, -8388608; sub.s32 %r1507, %r1504, %r1506; mov.b32 %f4218, %r1507; cvt.rn.f32.s32 %f4219, %r1506; fma.rn.f32 %f4221, %f4219, %f4149, %f4217; add.f32 %f4222, %f4218, 0fBF800000; add.f32 %f4213, %f4218, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4212,%f4213; // end inline asm add.f32 %f4223, %f4222, %f4222; 
mul.f32 %f4225, %f4212, %f4223; mul.f32 %f4226, %f4225, %f4225; sub.f32 %f4227, %f4222, %f4225; add.f32 %f4228, %f4227, %f4227; neg.f32 %f4229, %f4225; fma.rn.f32 %f4230, %f4229, %f4222, %f4228; mul.rn.f32 %f4231, %f4212, %f4230; fma.rn.f32 %f4234, %f4162, %f4226, %f4161; fma.rn.f32 %f4236, %f4234, %f4226, %f4164; fma.rn.f32 %f4238, %f4236, %f4226, %f4166; mul.rn.f32 %f4239, %f4238, %f4226; fma.rn.f32 %f4241, %f4225, %f4169, %f4221; sub.f32 %f4242, %f4221, %f4241; fma.rn.f32 %f4243, %f4225, %f4169, %f4242; fma.rn.f32 %f4244, %f4231, %f4169, %f4243; fma.rn.f32 %f4246, %f4225, %f4174, %f4244; mul.f32 %f4247, %f4239, 0f40400000; fma.rn.f32 %f4248, %f4247, %f4231, %f4246; fma.rn.f32 %f4249, %f4239, %f4225, %f4248; add.rn.f32 %f4250, %f4241, %f4249; neg.f32 %f4251, %f4241; add.rn.f32 %f4252, %f4250, %f4251; neg.f32 %f4253, %f4252; add.rn.f32 %f4254, %f4249, %f4253; mul.rn.f32 %f4255, %f4250, %f4153; neg.f32 %f4256, %f4255; fma.rn.f32 %f4257, %f4250, %f4153, %f4256; fma.rn.f32 %f4258, %f4254, %f4153, %f4257; cvt.rni.f32.f32 %f4259, %f4255; sub.f32 %f4260, %f4255, %f4259; add.f32 %f4261, %f4258, %f4260; fma.rn.f32 %f4264, %f4192, %f4261, %f4191; fma.rn.f32 %f4266, %f4264, %f4261, %f4194; fma.rn.f32 %f4268, %f4266, %f4261, %f4196; fma.rn.f32 %f4270, %f4268, %f4261, %f4198; fma.rn.f32 %f4272, %f4270, %f4261, %f4200; fma.rn.f32 %f4273, %f4272, %f4261, %f4939; cvt.rzi.s32.f32 %r1508, %f4259; setp.gt.f32 %p635, %f4259, 0f00000000; selp.b32 %r1509, 0, -2097152000, %p635; add.s32 %r1510, %r1509, 2130706432; mov.b32 %f4274, %r1510; mul.f32 %f4275, %f4273, %f4274; shl.b32 %r1511, %r1508, 23; sub.s32 %r1512, %r1511, %r1509; mov.b32 %f4276, %r1512; mul.f32 %f4277, %f4275, %f4276; abs.f32 %f4278, %f4255; setp.gt.f32 %p636, %f4278, 0f43180000; setp.lt.f32 %p637, %f4255, 0f00000000; selp.f32 %f4279, 0f00000000, 0f7F800000, %p637; selp.f32 %f764, %f4279, %f4277, %p636; setp.eq.f32 %p638, %f762, 0f3F800000; @%p638 bra $L__BB0_462; setp.gtu.f32 %p639, %f763, 0f7F800000; @%p639 bra 
$L__BB0_461; bra.uni $L__BB0_457; $L__BB0_461: mov.f32 %f4282, 0f40000000; add.rn.f32 %f4939, %f762, %f4282; bra.uni $L__BB0_462; $L__BB0_457: setp.eq.f32 %p640, %f762, 0f00000000; setp.eq.f32 %p641, %f763, 0f7F800000; or.pred %p642, %p640, %p641; @%p642 bra $L__BB0_460; bra.uni $L__BB0_458; $L__BB0_460: setp.eq.f32 %p645, %f19, 0f3F800000; add.f32 %f4281, %f762, %f762; mov.b32 %r1513, %f4281; and.b32 %r1514, %r1513, 2147483647; selp.b32 %r1515, %r1513, %r1514, %p645; mov.b32 %f4939, %r1515; bra.uni $L__BB0_462; $L__BB0_458: setp.geu.f32 %p643, %f762, 0f00000000; mov.f32 %f4939, %f764; @%p643 bra $L__BB0_462; setp.eq.f32 %p644, %f19, 0f3F800000; neg.f32 %f4280, %f764; selp.f32 %f4939, %f4280, %f764, %p644; $L__BB0_462: mov.f32 %f4286, 0f3F400000; sub.f32 %f769, %f4286, %f4939; add.f32 %f770, %f753, 0fBF000000; abs.f32 %f771, %f770; setp.lt.f32 %p646, %f771, 0f00800000; mul.f32 %f4287, %f771, 0f4B800000; selp.f32 %f4288, %f4287, %f771, %p646; selp.f32 %f4289, 0fC1C00000, 0f00000000, %p646; mov.b32 %r1516, %f4288; add.s32 %r1517, %r1516, -1060439283; and.b32 %r1518, %r1517, -8388608; sub.s32 %r1519, %r1516, %r1518; mov.b32 %f4290, %r1519; cvt.rn.f32.s32 %f4291, %r1518; mov.f32 %f4292, 0f34000000; fma.rn.f32 %f4293, %f4291, %f4292, %f4289; add.f32 %f4294, %f4290, 0fBF800000; add.f32 %f4284, %f4290, 0f3F800000; mov.f32 %f4941, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4283,%f4284; // end inline asm add.f32 %f4295, %f4294, %f4294; mov.f32 %f4296, 0f40000000; mul.f32 %f4297, %f4283, %f4295; mul.f32 %f4298, %f4297, %f4297; sub.f32 %f4299, %f4294, %f4297; add.f32 %f4300, %f4299, %f4299; neg.f32 %f4301, %f4297; fma.rn.f32 %f4302, %f4301, %f4294, %f4300; mul.rn.f32 %f4303, %f4283, %f4302; mov.f32 %f4304, 0f3B52E7DB; mov.f32 %f4305, 0f3A2C32E4; fma.rn.f32 %f4306, %f4305, %f4298, %f4304; mov.f32 %f4307, 0f3C93BB73; fma.rn.f32 %f4308, %f4306, %f4298, %f4307; mov.f32 %f4309, 0f3DF6384F; fma.rn.f32 %f4310, %f4308, %f4298, %f4309; mul.rn.f32 %f4311, %f4310, %f4298; 
mov.f32 %f4312, 0f3FB8AA3B; fma.rn.f32 %f4313, %f4297, %f4312, %f4293; sub.f32 %f4314, %f4293, %f4313; fma.rn.f32 %f4315, %f4297, %f4312, %f4314; fma.rn.f32 %f4316, %f4303, %f4312, %f4315; mov.f32 %f4317, 0f32A55E34; fma.rn.f32 %f4318, %f4297, %f4317, %f4316; mul.f32 %f4319, %f4311, 0f40400000; fma.rn.f32 %f4320, %f4319, %f4303, %f4318; fma.rn.f32 %f4321, %f4311, %f4297, %f4320; add.rn.f32 %f4322, %f4313, %f4321; neg.f32 %f4323, %f4313; add.rn.f32 %f4324, %f4322, %f4323; neg.f32 %f4325, %f4324; add.rn.f32 %f4326, %f4321, %f4325; mul.rn.f32 %f4327, %f4322, %f4296; neg.f32 %f4328, %f4327; fma.rn.f32 %f4329, %f4322, %f4296, %f4328; fma.rn.f32 %f4330, %f4326, %f4296, %f4329; cvt.rni.f32.f32 %f4331, %f4327; sub.f32 %f4332, %f4327, %f4331; add.f32 %f4333, %f4330, %f4332; mov.f32 %f4334, 0f3AAF85ED; mov.f32 %f4335, 0f391FCB8E; fma.rn.f32 %f4336, %f4335, %f4333, %f4334; mov.f32 %f4337, 0f3C1D9856; fma.rn.f32 %f4338, %f4336, %f4333, %f4337; mov.f32 %f4339, 0f3D6357BB; fma.rn.f32 %f4340, %f4338, %f4333, %f4339; mov.f32 %f4341, 0f3E75FDEC; fma.rn.f32 %f4342, %f4340, %f4333, %f4341; mov.f32 %f4343, 0f3F317218; fma.rn.f32 %f4344, %f4342, %f4333, %f4343; fma.rn.f32 %f4345, %f4344, %f4333, %f4941; cvt.rzi.s32.f32 %r1520, %f4331; setp.gt.f32 %p647, %f4331, 0f00000000; selp.b32 %r1521, 0, -2097152000, %p647; add.s32 %r1522, %r1521, 2130706432; mov.b32 %f4346, %r1522; mul.f32 %f4347, %f4345, %f4346; shl.b32 %r1523, %r1520, 23; sub.s32 %r1524, %r1523, %r1521; mov.b32 %f4348, %r1524; mul.f32 %f4349, %f4347, %f4348; abs.f32 %f4350, %f4327; setp.gt.f32 %p648, %f4350, 0f43180000; setp.lt.f32 %p649, %f4327, 0f00000000; selp.f32 %f4351, 0f00000000, 0f7F800000, %p649; selp.f32 %f772, %f4351, %f4349, %p648; setp.eq.f32 %p650, %f770, 0f3F800000; mov.f32 %f4940, %f4941; @%p650 bra $L__BB0_469; setp.gtu.f32 %p651, %f771, 0f7F800000; @%p651 bra $L__BB0_468; bra.uni $L__BB0_464; $L__BB0_468: mov.f32 %f4354, 0f40000000; add.rn.f32 %f4940, %f770, %f4354; bra.uni $L__BB0_469; $L__BB0_464: 
setp.eq.f32 %p652, %f770, 0f00000000; setp.eq.f32 %p653, %f771, 0f7F800000; or.pred %p654, %p652, %p653; @%p654 bra $L__BB0_467; bra.uni $L__BB0_465; $L__BB0_467: setp.eq.f32 %p657, %f19, 0f3F800000; add.f32 %f4353, %f770, %f770; mov.b32 %r1525, %f4353; and.b32 %r1526, %r1525, 2147483647; selp.b32 %r1527, %r1525, %r1526, %p657; mov.b32 %f4940, %r1527; bra.uni $L__BB0_469; $L__BB0_465: setp.geu.f32 %p655, %f770, 0f00000000; mov.f32 %f4940, %f772; @%p655 bra $L__BB0_469; setp.eq.f32 %p656, %f19, 0f3F800000; neg.f32 %f4352, %f772; selp.f32 %f4940, %f4352, %f772, %p656; $L__BB0_469: ld.param.f32 %f4781, [g2p2g_param_11]; mul.f32 %f777, %f4940, 0f3F000000; neg.f32 %f4358, %f752; div.rn.f32 %f778, %f4358, %f4781; mov.f32 %f4359, 0f3FC00000; sub.f32 %f779, %f4359, %f778; abs.f32 %f780, %f779; setp.lt.f32 %p658, %f780, 0f00800000; mul.f32 %f4360, %f780, 0f4B800000; selp.f32 %f4361, %f4360, %f780, %p658; selp.f32 %f4362, 0fC1C00000, 0f00000000, %p658; mov.b32 %r1528, %f4361; add.s32 %r1529, %r1528, -1060439283; and.b32 %r1530, %r1529, -8388608; sub.s32 %r1531, %r1528, %r1530; mov.b32 %f4363, %r1531; cvt.rn.f32.s32 %f4364, %r1530; fma.rn.f32 %f4366, %f4364, %f4292, %f4362; add.f32 %f4367, %f4363, 0fBF800000; add.f32 %f4356, %f4363, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4355,%f4356; // end inline asm add.f32 %f4368, %f4367, %f4367; mul.f32 %f4370, %f4355, %f4368; mul.f32 %f4371, %f4370, %f4370; sub.f32 %f4372, %f4367, %f4370; add.f32 %f4373, %f4372, %f4372; neg.f32 %f4374, %f4370; fma.rn.f32 %f4375, %f4374, %f4367, %f4373; mul.rn.f32 %f4376, %f4355, %f4375; fma.rn.f32 %f4379, %f4305, %f4371, %f4304; fma.rn.f32 %f4381, %f4379, %f4371, %f4307; fma.rn.f32 %f4383, %f4381, %f4371, %f4309; mul.rn.f32 %f4384, %f4383, %f4371; fma.rn.f32 %f4386, %f4370, %f4312, %f4366; sub.f32 %f4387, %f4366, %f4386; fma.rn.f32 %f4388, %f4370, %f4312, %f4387; fma.rn.f32 %f4389, %f4376, %f4312, %f4388; fma.rn.f32 %f4391, %f4370, %f4317, %f4389; mul.f32 %f4392, %f4384, 0f40400000; 
fma.rn.f32 %f4393, %f4392, %f4376, %f4391; fma.rn.f32 %f4394, %f4384, %f4370, %f4393; add.rn.f32 %f4395, %f4386, %f4394; neg.f32 %f4396, %f4386; add.rn.f32 %f4397, %f4395, %f4396; neg.f32 %f4398, %f4397; add.rn.f32 %f4399, %f4394, %f4398; mul.rn.f32 %f4400, %f4395, %f4296; neg.f32 %f4401, %f4400; fma.rn.f32 %f4402, %f4395, %f4296, %f4401; fma.rn.f32 %f4403, %f4399, %f4296, %f4402; cvt.rni.f32.f32 %f4404, %f4400; sub.f32 %f4405, %f4400, %f4404; add.f32 %f4406, %f4403, %f4405; fma.rn.f32 %f4409, %f4335, %f4406, %f4334; fma.rn.f32 %f4411, %f4409, %f4406, %f4337; fma.rn.f32 %f4413, %f4411, %f4406, %f4339; fma.rn.f32 %f4415, %f4413, %f4406, %f4341; fma.rn.f32 %f4417, %f4415, %f4406, %f4343; fma.rn.f32 %f4418, %f4417, %f4406, %f4941; cvt.rzi.s32.f32 %r1532, %f4404; setp.gt.f32 %p659, %f4404, 0f00000000; selp.b32 %r1533, 0, -2097152000, %p659; add.s32 %r1534, %r1533, 2130706432; mov.b32 %f4419, %r1534; mul.f32 %f4420, %f4418, %f4419; shl.b32 %r1535, %r1532, 23; sub.s32 %r1536, %r1535, %r1533; mov.b32 %f4421, %r1536; mul.f32 %f4422, %f4420, %f4421; abs.f32 %f4423, %f4400; setp.gt.f32 %p660, %f4423, 0f43180000; setp.lt.f32 %p661, %f4400, 0f00000000; selp.f32 %f4424, 0f00000000, 0f7F800000, %p661; selp.f32 %f781, %f4424, %f4422, %p660; setp.eq.f32 %p662, %f779, 0f3F800000; @%p662 bra $L__BB0_476; setp.gtu.f32 %p663, %f780, 0f7F800000; @%p663 bra $L__BB0_475; bra.uni $L__BB0_471; $L__BB0_475: mov.f32 %f4427, 0f40000000; add.rn.f32 %f4941, %f779, %f4427; bra.uni $L__BB0_476; $L__BB0_471: setp.eq.f32 %p664, %f779, 0f00000000; setp.eq.f32 %p665, %f780, 0f7F800000; or.pred %p666, %p664, %p665; @%p666 bra $L__BB0_474; bra.uni $L__BB0_472; $L__BB0_474: setp.eq.f32 %p669, %f19, 0f3F800000; add.f32 %f4426, %f779, %f779; mov.b32 %r1537, %f4426; and.b32 %r1538, %r1537, 2147483647; selp.b32 %r1539, %r1537, %r1538, %p669; mov.b32 %f4941, %r1539; bra.uni $L__BB0_476; $L__BB0_472: setp.geu.f32 %p667, %f779, 0f00000000; mov.f32 %f4941, %f781; @%p667 bra $L__BB0_476; setp.eq.f32 %p668, %f19, 
0f3F800000; neg.f32 %f4425, %f781; selp.f32 %f4941, %f4425, %f781, %p668; $L__BB0_476: mul.f32 %f786, %f4941, 0f3F000000; add.f32 %f787, %f778, 0fBF800000; abs.f32 %f788, %f787; setp.lt.f32 %p670, %f788, 0f00800000; mul.f32 %f4431, %f788, 0f4B800000; selp.f32 %f4432, %f4431, %f788, %p670; selp.f32 %f4433, 0fC1C00000, 0f00000000, %p670; mov.b32 %r1540, %f4432; add.s32 %r1541, %r1540, -1060439283; and.b32 %r1542, %r1541, -8388608; sub.s32 %r1543, %r1540, %r1542; mov.b32 %f4434, %r1543; cvt.rn.f32.s32 %f4435, %r1542; mov.f32 %f4436, 0f34000000; fma.rn.f32 %f4437, %f4435, %f4436, %f4433; add.f32 %f4438, %f4434, 0fBF800000; add.f32 %f4429, %f4434, 0f3F800000; mov.f32 %f4943, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4428,%f4429; // end inline asm add.f32 %f4439, %f4438, %f4438; mov.f32 %f4440, 0f40000000; mul.f32 %f4441, %f4428, %f4439; mul.f32 %f4442, %f4441, %f4441; sub.f32 %f4443, %f4438, %f4441; add.f32 %f4444, %f4443, %f4443; neg.f32 %f4445, %f4441; fma.rn.f32 %f4446, %f4445, %f4438, %f4444; mul.rn.f32 %f4447, %f4428, %f4446; mov.f32 %f4448, 0f3B52E7DB; mov.f32 %f4449, 0f3A2C32E4; fma.rn.f32 %f4450, %f4449, %f4442, %f4448; mov.f32 %f4451, 0f3C93BB73; fma.rn.f32 %f4452, %f4450, %f4442, %f4451; mov.f32 %f4453, 0f3DF6384F; fma.rn.f32 %f4454, %f4452, %f4442, %f4453; mul.rn.f32 %f4455, %f4454, %f4442; mov.f32 %f4456, 0f3FB8AA3B; fma.rn.f32 %f4457, %f4441, %f4456, %f4437; sub.f32 %f4458, %f4437, %f4457; fma.rn.f32 %f4459, %f4441, %f4456, %f4458; fma.rn.f32 %f4460, %f4447, %f4456, %f4459; mov.f32 %f4461, 0f32A55E34; fma.rn.f32 %f4462, %f4441, %f4461, %f4460; mul.f32 %f4463, %f4455, 0f40400000; fma.rn.f32 %f4464, %f4463, %f4447, %f4462; fma.rn.f32 %f4465, %f4455, %f4441, %f4464; add.rn.f32 %f4466, %f4457, %f4465; neg.f32 %f4467, %f4457; add.rn.f32 %f4468, %f4466, %f4467; neg.f32 %f4469, %f4468; add.rn.f32 %f4470, %f4465, %f4469; mul.rn.f32 %f4471, %f4466, %f4440; neg.f32 %f4472, %f4471; fma.rn.f32 %f4473, %f4466, %f4440, %f4472; fma.rn.f32 %f4474, %f4470, 
%f4440, %f4473; cvt.rni.f32.f32 %f4475, %f4471; sub.f32 %f4476, %f4471, %f4475; add.f32 %f4477, %f4474, %f4476; mov.f32 %f4478, 0f3AAF85ED; mov.f32 %f4479, 0f391FCB8E; fma.rn.f32 %f4480, %f4479, %f4477, %f4478; mov.f32 %f4481, 0f3C1D9856; fma.rn.f32 %f4482, %f4480, %f4477, %f4481; mov.f32 %f4483, 0f3D6357BB; fma.rn.f32 %f4484, %f4482, %f4477, %f4483; mov.f32 %f4485, 0f3E75FDEC; fma.rn.f32 %f4486, %f4484, %f4477, %f4485; mov.f32 %f4487, 0f3F317218; fma.rn.f32 %f4488, %f4486, %f4477, %f4487; fma.rn.f32 %f4489, %f4488, %f4477, %f4943; cvt.rzi.s32.f32 %r1544, %f4475; setp.gt.f32 %p671, %f4475, 0f00000000; selp.b32 %r1545, 0, -2097152000, %p671; add.s32 %r1546, %r1545, 2130706432; mov.b32 %f4490, %r1546; mul.f32 %f4491, %f4489, %f4490; shl.b32 %r1547, %r1544, 23; sub.s32 %r1548, %r1547, %r1545; mov.b32 %f4492, %r1548; mul.f32 %f4493, %f4491, %f4492; abs.f32 %f4494, %f4471; setp.gt.f32 %p672, %f4494, 0f43180000; setp.lt.f32 %p673, %f4471, 0f00000000; selp.f32 %f4495, 0f00000000, 0f7F800000, %p673; selp.f32 %f789, %f4495, %f4493, %p672; setp.eq.f32 %p674, %f787, 0f3F800000; mov.f32 %f4942, %f4943; @%p674 bra $L__BB0_483; setp.gtu.f32 %p675, %f788, 0f7F800000; @%p675 bra $L__BB0_482; bra.uni $L__BB0_478; $L__BB0_482: mov.f32 %f4498, 0f40000000; add.rn.f32 %f4942, %f787, %f4498; bra.uni $L__BB0_483; $L__BB0_478: setp.eq.f32 %p676, %f787, 0f00000000; setp.eq.f32 %p677, %f788, 0f7F800000; or.pred %p678, %p676, %p677; @%p678 bra $L__BB0_481; bra.uni $L__BB0_479; $L__BB0_481: setp.eq.f32 %p681, %f19, 0f3F800000; add.f32 %f4497, %f787, %f787; mov.b32 %r1549, %f4497; and.b32 %r1550, %r1549, 2147483647; selp.b32 %r1551, %r1549, %r1550, %p681; mov.b32 %f4942, %r1551; bra.uni $L__BB0_483; $L__BB0_479: setp.geu.f32 %p679, %f787, 0f00000000; mov.f32 %f4942, %f789; @%p679 bra $L__BB0_483; setp.eq.f32 %p680, %f19, 0f3F800000; neg.f32 %f4496, %f789; selp.f32 %f4942, %f4496, %f789, %p680; $L__BB0_483: mov.f32 %f4502, 0f3F400000; sub.f32 %f794, %f4502, %f4942; add.f32 %f795, %f778, 
0fBF000000; abs.f32 %f796, %f795; setp.lt.f32 %p682, %f796, 0f00800000; mul.f32 %f4503, %f796, 0f4B800000; selp.f32 %f4504, %f4503, %f796, %p682; selp.f32 %f4505, 0fC1C00000, 0f00000000, %p682; mov.b32 %r1552, %f4504; add.s32 %r1553, %r1552, -1060439283; and.b32 %r1554, %r1553, -8388608; sub.s32 %r1555, %r1552, %r1554; mov.b32 %f4506, %r1555; cvt.rn.f32.s32 %f4507, %r1554; fma.rn.f32 %f4509, %f4507, %f4436, %f4505; add.f32 %f4510, %f4506, 0fBF800000; add.f32 %f4500, %f4506, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f4499,%f4500; // end inline asm add.f32 %f4511, %f4510, %f4510; mul.f32 %f4513, %f4499, %f4511; mul.f32 %f4514, %f4513, %f4513; sub.f32 %f4515, %f4510, %f4513; add.f32 %f4516, %f4515, %f4515; neg.f32 %f4517, %f4513; fma.rn.f32 %f4518, %f4517, %f4510, %f4516; mul.rn.f32 %f4519, %f4499, %f4518; fma.rn.f32 %f4522, %f4449, %f4514, %f4448; fma.rn.f32 %f4524, %f4522, %f4514, %f4451; fma.rn.f32 %f4526, %f4524, %f4514, %f4453; mul.rn.f32 %f4527, %f4526, %f4514; fma.rn.f32 %f4529, %f4513, %f4456, %f4509; sub.f32 %f4530, %f4509, %f4529; fma.rn.f32 %f4531, %f4513, %f4456, %f4530; fma.rn.f32 %f4532, %f4519, %f4456, %f4531; fma.rn.f32 %f4534, %f4513, %f4461, %f4532; mul.f32 %f4535, %f4527, 0f40400000; fma.rn.f32 %f4536, %f4535, %f4519, %f4534; fma.rn.f32 %f4537, %f4527, %f4513, %f4536; add.rn.f32 %f4538, %f4529, %f4537; neg.f32 %f4539, %f4529; add.rn.f32 %f4540, %f4538, %f4539; neg.f32 %f4541, %f4540; add.rn.f32 %f4542, %f4537, %f4541; mul.rn.f32 %f4543, %f4538, %f4440; neg.f32 %f4544, %f4543; fma.rn.f32 %f4545, %f4538, %f4440, %f4544; fma.rn.f32 %f4546, %f4542, %f4440, %f4545; cvt.rni.f32.f32 %f4547, %f4543; sub.f32 %f4548, %f4543, %f4547; add.f32 %f4549, %f4546, %f4548; fma.rn.f32 %f4552, %f4479, %f4549, %f4478; fma.rn.f32 %f4554, %f4552, %f4549, %f4481; fma.rn.f32 %f4556, %f4554, %f4549, %f4483; fma.rn.f32 %f4558, %f4556, %f4549, %f4485; fma.rn.f32 %f4560, %f4558, %f4549, %f4487; fma.rn.f32 %f4561, %f4560, %f4549, %f4943; cvt.rzi.s32.f32 %r1556, %f4547; 
setp.gt.f32 %p683, %f4547, 0f00000000; selp.b32 %r1557, 0, -2097152000, %p683; add.s32 %r1558, %r1557, 2130706432; mov.b32 %f4562, %r1558; mul.f32 %f4563, %f4561, %f4562; shl.b32 %r1559, %r1556, 23; sub.s32 %r1560, %r1559, %r1557; mov.b32 %f4564, %r1560; mul.f32 %f4565, %f4563, %f4564; abs.f32 %f4566, %f4543; setp.gt.f32 %p684, %f4566, 0f43180000; setp.lt.f32 %p685, %f4543, 0f00000000; selp.f32 %f4567, 0f00000000, 0f7F800000, %p685; selp.f32 %f797, %f4567, %f4565, %p684; setp.eq.f32 %p686, %f795, 0f3F800000; @%p686 bra $L__BB0_490; setp.gtu.f32 %p687, %f796, 0f7F800000; @%p687 bra $L__BB0_489; bra.uni $L__BB0_485; $L__BB0_489: mov.f32 %f4570, 0f40000000; add.rn.f32 %f4943, %f795, %f4570; bra.uni $L__BB0_490; $L__BB0_485: setp.eq.f32 %p688, %f795, 0f00000000; setp.eq.f32 %p689, %f796, 0f7F800000; or.pred %p690, %p688, %p689; @%p690 bra $L__BB0_488; bra.uni $L__BB0_486; $L__BB0_488: setp.eq.f32 %p693, %f19, 0f3F800000; add.f32 %f4569, %f795, %f795; mov.b32 %r1561, %f4569; and.b32 %r1562, %r1561, 2147483647; selp.b32 %r1563, %r1561, %r1562, %p693; mov.b32 %f4943, %r1563; bra.uni $L__BB0_490; $L__BB0_486: setp.geu.f32 %p691, %f795, 0f00000000; mov.f32 %f4943, %f797; @%p691 bra $L__BB0_490; setp.eq.f32 %p692, %f19, 0f3F800000; neg.f32 %f4568, %f797; selp.f32 %f4943, %f4568, %f797, %p692; $L__BB0_490: mov.b32 %f4811, %r8; mov.u64 %rd1104, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; ld.param.f32 %f4784, [g2p2g_param_11]; add.f32 %f4783, %f4784, %f4784; ld.param.f32 %f4782, [g2p2g_param_0]; add.u64 %rd1102, %SPL, 96; mul.f32 %f802, %f4943, 0f3F000000; mul.f32 %f4571, %f4, %f4866; mul.f32 %f4572, %f4, %f4867; mul.f32 %f4573, %f4, %f4868; mul.f32 %f4574, %f4, %f4869; mul.f32 %f4575, %f12, %f5; mul.f32 %f4576, %f4575, %f4782; mul.f32 %f4577, %f4576, %f4934; mul.f32 %f4578, %f4576, %f4935; mul.f32 %f4579, %f4576, %f4936; mul.f32 %f4580, %f4576, %f4937; sub.f32 %f803, %f4571, %f4577; sub.f32 %f804, %f4572, %f4578; sub.f32 %f805, 
%f4573, %f4579; sub.f32 %f806, %f4574, %f4580; ld.local.u64 %rd837, [%rd1102]; cvt.u32.u64 %r1564, %rd837; mov.b32 %f4581, %r1564; mul.f32 %f4582, %f4, %f4581; shr.u64 %rd838, %rd837, 32; cvt.u32.u64 %r1565, %rd838; mov.b32 %f4583, %r1565; mul.f32 %f4584, %f4, %f4583; fma.rn.f32 %f807, %f4782, 0f00000000, %f4582; fma.rn.f32 %f808, %f4782, 0f00000000, %f4584; setp.gt.f32 %p694, %f4811, 0f00000000; selp.f32 %f809, %f4, 0f00000000, %p694; mul.f32 %f810, %f413, %f809; sub.f32 %f4585, %f749, %f13; setp.gt.f32 %p695, %f4585, 0f5EFFFFFF; max.f32 %f4586, %f4585, 0fDF000000; cvt.rzi.s64.f32 %rd839, %f4586; selp.b64 %rd840, 4294967295, %rd839, %p695; setp.num.f32 %p696, %f4585, %f4585; selp.b64 %rd841, %rd840, 0, %p696; sub.f32 %f4587, %f750, %f14; setp.gt.f32 %p697, %f4587, 0f5EFFFFFF; max.f32 %f4588, %f4587, 0fDF000000; cvt.rzi.s64.f32 %rd842, %f4588; setp.num.f32 %p698, %f4587, %f4587; add.s64 %rd843, %rd841, %rd80; shl.b64 %rd844, %rd842, 3; selp.b64 %rd845, 4294967288, %rd844, %p697; selp.b64 %rd846, %rd845, 0, %p698; add.s64 %rd847, %rd843, %rd846; and.b64 %rd266, %rd847, 4294967295; add.f32 %f811, %f751, %f4783; mul.f32 %f812, %f803, %f811; add.f32 %f813, %f752, %f4783; mul.f32 %f814, %f805, %f813; add.f32 %f4589, %f812, %f814; add.f32 %f815, %f807, %f4589; shl.b64 %rd848, %rd847, 6; and.b64 %rd849, %rd848, 274877906880; cvta.shared.u64 %rd851, %rd1104; add.s64 %rd852, %rd851, %rd849; add.s64 %rd267, %rd852, 1212; mov.b32 %r1706, %f413; $L__BB0_491: // begin inline asm cvta.to.shared.u64 %rd853, %rd267;atom.acquire.shared.exch.b32 %r1566, [%rd853], %r1; // end inline asm setp.ne.s32 %p699, %r1566, -1; @%p699 bra $L__BB0_491; ld.param.f32 %f4786, [g2p2g_param_11]; mul.f32 %f4785, %f4786, 0f00000000; mov.u64 %rd1105, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; mul.f32 %f4590, %f777, %f802; mul.f32 %f816, %f804, %f811; mul.f32 %f817, %f806, %f813; add.f32 %f4591, %f816, %f817; add.f32 %f4592, %f808, %f4591; shl.b64 
%rd857, %rd266, 6; add.s64 %rd859, %rd1105, %rd857; add.s64 %rd268, %rd859, 1172; ld.shared.f32 %f4593, [%rd859+1172]; fma.rn.f32 %f4594, %f4, %f4590, %f4593; st.shared.f32 [%rd859+1172], %f4594; ld.shared.v2.f32 {%f4595, %f4596}, [%rd859+1176]; fma.rn.f32 %f4599, %f815, %f4590, %f4595; st.shared.f32 [%rd859+1176], %f4599; fma.rn.f32 %f4600, %f4592, %f4590, %f4596; st.shared.f32 [%rd859+1180], %f4600; ld.shared.v2.f32 {%f4601, %f4602}, [%rd859+1192]; fma.rn.f32 %f4605, %f810, %f4590, %f4602; fma.rn.f32 %f4606, %f809, %f4590, %f4601; st.shared.v2.f32 [%rd859+1192], {%f4606, %f4605}; mov.u32 %r1569, -1; // begin inline asm cvta.to.shared.u64 %rd855, %rd267;atom.release.shared.exch.b32 %r1568, [%rd855], %r1569; // end inline asm add.f32 %f818, %f752, %f4785; mul.f32 %f819, %f805, %f818; add.f32 %f4607, %f812, %f819; add.f32 %f820, %f807, %f4607; add.s64 %rd861, %rd851, %rd857; add.s64 %rd269, %rd861, 188; $L__BB0_493: // begin inline asm cvta.to.shared.u64 %rd862, %rd269;atom.acquire.shared.exch.b32 %r1570, [%rd862], %r1; // end inline asm setp.ne.s32 %p700, %r1570, -1; @%p700 bra $L__BB0_493; ld.param.f32 %f4787, [g2p2g_param_11]; mul.f32 %f4608, %f777, %f786; mul.f32 %f821, %f806, %f818; add.f32 %f4609, %f816, %f821; add.f32 %f4610, %f808, %f4609; ld.shared.f32 %f4611, [%rd268+-1024]; fma.rn.f32 %f4612, %f4, %f4608, %f4611; st.shared.f32 [%rd268+-1024], %f4612; ld.shared.v2.f32 {%f4613, %f4614}, [%rd268+-1020]; fma.rn.f32 %f4617, %f820, %f4608, %f4613; st.shared.f32 [%rd268+-1020], %f4617; fma.rn.f32 %f4618, %f4610, %f4608, %f4614; st.shared.f32 [%rd268+-1016], %f4618; ld.shared.v2.f32 {%f4619, %f4620}, [%rd268+-1004]; fma.rn.f32 %f4623, %f810, %f4608, %f4620; fma.rn.f32 %f4624, %f809, %f4608, %f4619; st.shared.v2.f32 [%rd268+-1004], {%f4624, %f4623}; mov.u32 %r1573, -1; // begin inline asm cvta.to.shared.u64 %rd864, %rd269;atom.release.shared.exch.b32 %r1572, [%rd864], %r1573; // end inline asm add.f32 %f822, %f752, %f4787; mul.f32 %f823, %f805, %f822; add.f32 
%f4625, %f812, %f823; add.f32 %f824, %f807, %f4625; add.s64 %rd270, %rd861, 700; $L__BB0_495: // begin inline asm cvta.to.shared.u64 %rd870, %rd270;atom.acquire.shared.exch.b32 %r1574, [%rd870], %r1; // end inline asm setp.ne.s32 %p701, %r1574, -1; @%p701 bra $L__BB0_495; ld.param.f32 %f4789, [g2p2g_param_11]; mul.f32 %f4788, %f4789, 0f00000000; mul.f32 %f4626, %f777, %f794; mul.f32 %f825, %f806, %f822; add.f32 %f4627, %f816, %f825; add.f32 %f4628, %f808, %f4627; ld.shared.f32 %f4629, [%rd268+-512]; fma.rn.f32 %f4630, %f4, %f4626, %f4629; st.shared.f32 [%rd268+-512], %f4630; ld.shared.v2.f32 {%f4631, %f4632}, [%rd268+-508]; fma.rn.f32 %f4635, %f824, %f4626, %f4631; st.shared.f32 [%rd268+-508], %f4635; fma.rn.f32 %f4636, %f4628, %f4626, %f4632; st.shared.f32 [%rd268+-504], %f4636; ld.shared.v2.f32 {%f4637, %f4638}, [%rd268+-492]; fma.rn.f32 %f4641, %f810, %f4626, %f4638; fma.rn.f32 %f4642, %f809, %f4626, %f4637; st.shared.v2.f32 [%rd268+-492], {%f4642, %f4641}; mov.u32 %r1577, -1; // begin inline asm cvta.to.shared.u64 %rd872, %rd270;atom.release.shared.exch.b32 %r1576, [%rd872], %r1577; // end inline asm add.f32 %f826, %f751, %f4788; mul.f32 %f827, %f803, %f826; add.f32 %f4643, %f827, %f814; add.f32 %f828, %f807, %f4643; add.s64 %rd271, %rd861, 1084; $L__BB0_497: // begin inline asm cvta.to.shared.u64 %rd878, %rd271;atom.acquire.shared.exch.b32 %r1578, [%rd878], %r1; // end inline asm setp.ne.s32 %p702, %r1578, -1; @%p702 bra $L__BB0_497; mul.f32 %f4644, %f761, %f802; mul.f32 %f829, %f804, %f826; add.f32 %f4645, %f829, %f817; add.f32 %f4646, %f808, %f4645; ld.shared.f32 %f4647, [%rd268+-128]; fma.rn.f32 %f4648, %f4, %f4644, %f4647; st.shared.f32 [%rd268+-128], %f4648; ld.shared.v2.f32 {%f4649, %f4650}, [%rd268+-124]; fma.rn.f32 %f4653, %f828, %f4644, %f4649; st.shared.f32 [%rd268+-124], %f4653; fma.rn.f32 %f4654, %f4646, %f4644, %f4650; st.shared.f32 [%rd268+-120], %f4654; ld.shared.v2.f32 {%f4655, %f4656}, [%rd268+-108]; fma.rn.f32 %f4659, %f810, %f4644, %f4656; 
fma.rn.f32 %f4660, %f809, %f4644, %f4655; st.shared.v2.f32 [%rd268+-108], {%f4660, %f4659}; mov.u32 %r1581, -1; // begin inline asm cvta.to.shared.u64 %rd880, %rd271;atom.release.shared.exch.b32 %r1580, [%rd880], %r1581; // end inline asm add.f32 %f4661, %f827, %f819; add.f32 %f830, %f807, %f4661; add.s64 %rd272, %rd861, 60; $L__BB0_499: // begin inline asm cvta.to.shared.u64 %rd886, %rd272;atom.acquire.shared.exch.b32 %r1582, [%rd886], %r1; // end inline asm setp.ne.s32 %p703, %r1582, -1; @%p703 bra $L__BB0_499; mul.f32 %f4662, %f761, %f786; add.f32 %f4663, %f829, %f821; add.f32 %f4664, %f808, %f4663; ld.shared.f32 %f4665, [%rd268+-1152]; fma.rn.f32 %f4666, %f4, %f4662, %f4665; st.shared.f32 [%rd268+-1152], %f4666; ld.shared.v2.f32 {%f4667, %f4668}, [%rd268+-1148]; fma.rn.f32 %f4671, %f830, %f4662, %f4667; st.shared.f32 [%rd268+-1148], %f4671; fma.rn.f32 %f4672, %f4664, %f4662, %f4668; st.shared.f32 [%rd268+-1144], %f4672; ld.shared.v2.f32 {%f4673, %f4674}, [%rd268+-1132]; fma.rn.f32 %f4677, %f810, %f4662, %f4674; fma.rn.f32 %f4678, %f809, %f4662, %f4673; st.shared.v2.f32 [%rd268+-1132], {%f4678, %f4677}; mov.u32 %r1585, -1; // begin inline asm cvta.to.shared.u64 %rd888, %rd272;atom.release.shared.exch.b32 %r1584, [%rd888], %r1585; // end inline asm add.f32 %f4679, %f827, %f823; add.f32 %f831, %f807, %f4679; add.s64 %rd273, %rd272, 512; $L__BB0_501: // begin inline asm cvta.to.shared.u64 %rd890, %rd273;atom.acquire.shared.exch.b32 %r1586, [%rd890], %r1; // end inline asm setp.ne.s32 %p704, %r1586, -1; @%p704 bra $L__BB0_501; ld.param.f32 %f4790, [g2p2g_param_11]; mul.f32 %f4680, %f761, %f794; add.f32 %f4681, %f829, %f825; add.f32 %f4682, %f808, %f4681; ld.shared.f32 %f4683, [%rd268+-640]; fma.rn.f32 %f4684, %f4, %f4680, %f4683; st.shared.f32 [%rd268+-640], %f4684; ld.shared.v2.f32 {%f4685, %f4686}, [%rd268+-636]; fma.rn.f32 %f4689, %f831, %f4680, %f4685; st.shared.f32 [%rd268+-636], %f4689; fma.rn.f32 %f4690, %f4682, %f4680, %f4686; st.shared.f32 [%rd268+-632], 
%f4690; ld.shared.v2.f32 {%f4691, %f4692}, [%rd268+-620]; fma.rn.f32 %f4695, %f810, %f4680, %f4692; fma.rn.f32 %f4696, %f809, %f4680, %f4691; st.shared.v2.f32 [%rd268+-620], {%f4696, %f4695}; mov.u32 %r1589, -1; // begin inline asm cvta.to.shared.u64 %rd892, %rd273;atom.release.shared.exch.b32 %r1588, [%rd892], %r1589; // end inline asm add.f32 %f832, %f751, %f4790; mul.f32 %f833, %f803, %f832; add.f32 %f4697, %f833, %f814; add.f32 %f834, %f807, %f4697; add.s64 %rd274, %rd272, 1088; $L__BB0_503: // begin inline asm cvta.to.shared.u64 %rd894, %rd274;atom.acquire.shared.exch.b32 %r1590, [%rd894], %r1; // end inline asm setp.ne.s32 %p705, %r1590, -1; @%p705 bra $L__BB0_503; mul.f32 %f4698, %f769, %f802; mul.f32 %f835, %f804, %f832; add.f32 %f4699, %f835, %f817; add.f32 %f4700, %f808, %f4699; ld.shared.f32 %f4701, [%rd268+-64]; fma.rn.f32 %f4702, %f4, %f4698, %f4701; st.shared.f32 [%rd268+-64], %f4702; ld.shared.v2.f32 {%f4703, %f4704}, [%rd268+-60]; fma.rn.f32 %f4707, %f834, %f4698, %f4703; st.shared.f32 [%rd268+-60], %f4707; fma.rn.f32 %f4708, %f4700, %f4698, %f4704; st.shared.f32 [%rd268+-56], %f4708; ld.shared.v2.f32 {%f4709, %f4710}, [%rd268+-44]; fma.rn.f32 %f4713, %f810, %f4698, %f4710; fma.rn.f32 %f4714, %f809, %f4698, %f4709; st.shared.v2.f32 [%rd268+-44], {%f4714, %f4713}; mov.u32 %r1593, -1; // begin inline asm cvta.to.shared.u64 %rd896, %rd274;atom.release.shared.exch.b32 %r1592, [%rd896], %r1593; // end inline asm add.f32 %f4715, %f833, %f819; add.f32 %f836, %f807, %f4715; add.s64 %rd275, %rd272, 64; $L__BB0_505: // begin inline asm cvta.to.shared.u64 %rd898, %rd275;atom.acquire.shared.exch.b32 %r1594, [%rd898], %r1; // end inline asm setp.ne.s32 %p706, %r1594, -1; @%p706 bra $L__BB0_505; mul.f32 %f4716, %f769, %f786; add.f32 %f4717, %f835, %f821; add.f32 %f4718, %f808, %f4717; ld.shared.f32 %f4719, [%rd268+-1088]; fma.rn.f32 %f4720, %f4, %f4716, %f4719; st.shared.f32 [%rd268+-1088], %f4720; ld.shared.v2.f32 {%f4721, %f4722}, [%rd268+-1084]; fma.rn.f32 
%f4725, %f836, %f4716, %f4721; st.shared.f32 [%rd268+-1084], %f4725; fma.rn.f32 %f4726, %f4718, %f4716, %f4722; st.shared.f32 [%rd268+-1080], %f4726; ld.shared.v2.f32 {%f4727, %f4728}, [%rd268+-1068]; fma.rn.f32 %f4731, %f810, %f4716, %f4728; fma.rn.f32 %f4732, %f809, %f4716, %f4727; st.shared.v2.f32 [%rd268+-1068], {%f4732, %f4731}; mov.u32 %r1597, -1; // begin inline asm cvta.to.shared.u64 %rd900, %rd275;atom.release.shared.exch.b32 %r1596, [%rd900], %r1597; // end inline asm add.f32 %f4733, %f833, %f823; add.f32 %f837, %f807, %f4733; add.s64 %rd276, %rd272, 576; $L__BB0_507: // begin inline asm cvta.to.shared.u64 %rd902, %rd276;atom.acquire.shared.exch.b32 %r1598, [%rd902], %r1; // end inline asm setp.ne.s32 %p707, %r1598, -1; @%p707 bra $L__BB0_507; mul.f32 %f4734, %f769, %f794; add.f32 %f4735, %f835, %f825; add.f32 %f4736, %f808, %f4735; ld.shared.f32 %f4737, [%rd268+-576]; fma.rn.f32 %f4738, %f4, %f4734, %f4737; st.shared.f32 [%rd268+-576], %f4738; ld.shared.v2.f32 {%f4739, %f4740}, [%rd268+-572]; fma.rn.f32 %f4743, %f837, %f4734, %f4739; st.shared.f32 [%rd268+-572], %f4743; fma.rn.f32 %f4744, %f4736, %f4734, %f4740; st.shared.f32 [%rd268+-568], %f4744; ld.shared.v2.f32 {%f4745, %f4746}, [%rd268+-556]; fma.rn.f32 %f4749, %f810, %f4734, %f4746; fma.rn.f32 %f4750, %f809, %f4734, %f4745; st.shared.v2.f32 [%rd268+-556], {%f4750, %f4749}; mov.u32 %r1601, -1; // begin inline asm cvta.to.shared.u64 %rd904, %rd276;atom.release.shared.exch.b32 %r1600, [%rd904], %r1601; // end inline asm mov.u16 %rs37, 0; $L__BB0_510: add.u64 %rd1106, %SPL, 96; ld.param.u64 %rd1095, [g2p2g_param_7]; mul.wide.u32 %rd1094, %r332, 8; cvta.to.global.u64 %rd1093, %rd1095; add.s64 %rd1092, %rd1093, %rd1094; mul.wide.u32 %rd1063, %r332, 8; ld.param.u64 %rd1062, [g2p2g_param_6]; mul.wide.u32 %rd1061, %r332, 32; cvta.to.global.u64 %rd1060, %rd1062; add.s64 %rd1059, %rd1060, %rd1061; ld.param.u64 %rd1058, [g2p2g_param_5]; cvta.to.global.u64 %rd1057, %rd1058; add.s64 %rd1056, %rd1057, %rd1063; 
ld.param.u64 %rd1055, [g2p2g_param_4]; cvta.to.global.u64 %rd1054, %rd1055; add.s64 %rd1053, %rd1054, %rd1063; ld.param.u64 %rd1052, [g2p2g_param_3]; mul.wide.u32 %rd1051, %r332, 24; cvta.to.global.u64 %rd1050, %rd1052; add.s64 %rd1049, %rd1050, %rd1051; st.global.v4.u8 [%rd1049], {%rs37, %rs10, %rs11, %rs12}; st.global.u32 [%rd1049+4], %rd72; shr.u64 %rd911, %rd72, 32; st.global.u32 [%rd1049+8], %rd911; st.global.u32 [%rd1049+12], %r7; st.global.u64 [%rd1049+16], %rd73; st.global.f32 [%rd1053], %f79; st.global.f32 [%rd1053+4], %f80; ld.local.u64 %rd912, [%rd1106]; st.global.u32 [%rd1056], %rd912; shr.u64 %rd913, %rd912, 32; st.global.u32 [%rd1056+4], %rd913; st.global.f32 [%rd1059], %f4; st.global.f32 [%rd1059+4], %f5; st.global.f32 [%rd1059+8], %f6; st.global.f32 [%rd1059+12], %f4875; st.global.f32 [%rd1059+16], %f4864; st.global.f32 [%rd1059+20], %f4863; st.global.f32 [%rd1059+24], %f4944; st.global.f32 [%rd1059+28], %f4865; st.global.u32 [%rd1092], %r8; st.global.u32 [%rd1092+4], %r1706; $L__BB0_511: shr.u64 %rd1071, %rd17, 16; xor.b64 %rd1070, %rd1071, %rd17; mul.lo.s64 %rd1069, %rd1070, 2246822507; shr.u64 %rd1068, %rd1069, 13; xor.b64 %rd1067, %rd1068, %rd1069; mul.lo.s64 %rd1066, %rd1067, 3266489909; shr.u64 %rd1065, %rd1066, 16; xor.b64 %rd1064, %rd1065, %rd1066; ld.param.u32 %r1633, [g2p2g_param_11+40]; bar.sync 0; cvt.u64.u32 %rd914, %r1633; add.s64 %rd277, %rd914, -1; and.b64 %rd1195, %rd1064, %rd277; shl.b64 %rd915, %rd1195, 4; add.s64 %rd916, %rd11, %rd915; ld.global.u64 %rd279, [%rd916]; setp.eq.s64 %p708, %rd279, %rd17; @%p708 bra $L__BB0_517; setp.eq.s64 %p709, %rd279, -1; @%p709 bra $L__BB0_516; $L__BB0_514: add.s64 %rd917, %rd1195, 1; and.b64 %rd1195, %rd917, %rd277; shl.b64 %rd918, %rd1195, 4; add.s64 %rd919, %rd11, %rd918; ld.global.u64 %rd282, [%rd919]; setp.eq.s64 %p710, %rd282, %rd17; @%p710 bra $L__BB0_517; setp.ne.s64 %p711, %rd282, -1; @%p711 bra $L__BB0_514; $L__BB0_516: trap; $L__BB0_517: cvt.u64.u32 %rd1072, %r309; mov.u32 %r1634, 
%ntid.x; and.b64 %rd284, %rd14, 15; add.s64 %rd285, %rd284, %rd1072; setp.gt.u32 %p712, %r1634, 64; @%p712 bra $L__BB0_534; mul.wide.u32 %rd1077, %r309, %r1; shr.u64 %rd1076, %rd1077, 5; and.b64 %rd1075, %rd1076, 1; shr.u64 %rd1074, %rd1077, 4; and.b64 %rd1073, %rd1074, 1; shl.b64 %rd920, %rd1195, 4; add.s64 %rd921, %rd11, %rd920; shl.b64 %rd288, %rd1073, 2; shl.b64 %rd289, %rd1075, 2; ld.global.u32 %r1607, [%rd921+8]; mul.wide.u32 %rd290, %r1607, 16; add.s64 %rd922, %rd284, 1; max.u64 %rd291, %rd922, %rd285; sub.s64 %rd923, %rd291, %rd1077; and.b64 %rd1198, %rd923, 3; setp.eq.s64 %p713, %rd1198, 0; mov.u64 %rd1204, %rd284; @%p713 bra $L__BB0_523; mov.u64 %rd1197, %rd284; $L__BB0_520: .pragma "nounroll"; add.s64 %rd1204, %rd1197, 1; shr.u64 %rd924, %rd1197, 2; and.b64 %rd925, %rd924, 3; and.b64 %rd926, %rd1197, 3; or.b64 %rd927, %rd926, %rd288; or.b64 %rd928, %rd925, %rd289; shl.b64 %rd929, %rd928, 3; or.b64 %rd296, %rd927, %rd929; or.b64 %rd930, %rd926, %rd290; and.b64 %rd931, %rd1197, 12; or.b64 %rd297, %rd930, %rd931; setp.le.u64 %p714, %rd350, %rd297; @%p714 bra $L__BB0_522; mul.lo.s64 %rd942, %rd297, 56; add.s64 %rd933, %rd344, %rd942; shl.b64 %rd943, %rd296, 6; mov.u64 %rd944, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; add.s64 %rd945, %rd944, %rd943; ld.shared.u32 %r1608, [%rd945+20]; // begin inline asm cvta.to.global.u64 %rd932, %rd933;red.global.add.f32 [%rd932], %r1608; // end inline asm add.s64 %rd935, %rd933, 4; ld.shared.u64 %rd946, [%rd945+24]; cvt.u32.u64 %r1609, %rd946; shr.u64 %rd947, %rd946, 32; cvt.u32.u64 %r1610, %rd947; // begin inline asm cvta.to.global.u64 %rd934, %rd935;red.global.add.f32 [%rd934], %r1609; // end inline asm add.s64 %rd937, %rd933, 8; // begin inline asm cvta.to.global.u64 %rd936, %rd937;red.global.add.f32 [%rd936], %r1610; // end inline asm add.s64 %rd939, %rd933, 12; ld.shared.u32 %r1611, [%rd945+44]; // begin inline asm cvta.to.global.u64 %rd938, %rd939;red.global.add.f32 
[%rd938], %r1611; // end inline asm add.s64 %rd941, %rd933, 16; ld.shared.u32 %r1612, [%rd945+40]; // begin inline asm cvta.to.global.u64 %rd940, %rd941;red.global.add.f32 [%rd940], %r1612; // end inline asm $L__BB0_522: add.s64 %rd1198, %rd1198, -1; setp.ne.s64 %p715, %rd1198, 0; mov.u64 %rd1197, %rd1204; @%p715 bra $L__BB0_520; $L__BB0_523: not.b64 %rd948, %rd284; add.s64 %rd949, %rd291, %rd948; setp.lt.u64 %p716, %rd949, 3; @%p716 bra $L__BB0_534; add.s64 %rd950, %rd1204, 3; and.b64 %rd951, %rd950, 3; and.b64 %rd952, %rd1204, 3; xor.b64 %rd953, %rd952, 2; add.s64 %rd954, %rd1204, 1; and.b64 %rd955, %rd954, 3; or.b64 %rd300, %rd952, %rd288; or.b64 %rd301, %rd952, %rd290; or.b64 %rd302, %rd955, %rd288; or.b64 %rd303, %rd955, %rd290; or.b64 %rd304, %rd953, %rd288; or.b64 %rd305, %rd953, %rd290; or.b64 %rd306, %rd951, %rd288; or.b64 %rd307, %rd951, %rd290; shr.u64 %rd1203, %rd950, 2; add.s64 %rd956, %rd1204, 2; shr.u64 %rd1202, %rd956, 2; shr.u64 %rd1201, %rd1204, 2; shr.u64 %rd1200, %rd954, 2; $L__BB0_525: and.b64 %rd317, %rd1201, 3; shl.b64 %rd957, %rd1201, 2; and.b64 %rd958, %rd957, 12; or.b64 %rd318, %rd301, %rd958; setp.le.u64 %p717, %rd350, %rd318; @%p717 bra $L__BB0_527; mul.lo.s64 %rd969, %rd318, 56; add.s64 %rd960, %rd344, %rd969; or.b64 %rd970, %rd317, %rd289; shl.b64 %rd971, %rd970, 3; or.b64 %rd972, %rd300, %rd971; shl.b64 %rd973, %rd972, 6; mov.u64 %rd974, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; add.s64 %rd975, %rd974, %rd973; ld.shared.u32 %r1613, [%rd975+20]; // begin inline asm cvta.to.global.u64 %rd959, %rd960;red.global.add.f32 [%rd959], %r1613; // end inline asm add.s64 %rd962, %rd960, 4; ld.shared.u64 %rd976, [%rd975+24]; cvt.u32.u64 %r1614, %rd976; shr.u64 %rd977, %rd976, 32; cvt.u32.u64 %r1615, %rd977; // begin inline asm cvta.to.global.u64 %rd961, %rd962;red.global.add.f32 [%rd961], %r1614; // end inline asm add.s64 %rd964, %rd960, 8; // begin inline asm cvta.to.global.u64 %rd963, 
%rd964;red.global.add.f32 [%rd963], %r1615; // end inline asm add.s64 %rd966, %rd960, 12; ld.shared.u32 %r1616, [%rd975+44]; // begin inline asm cvta.to.global.u64 %rd965, %rd966;red.global.add.f32 [%rd965], %r1616; // end inline asm add.s64 %rd968, %rd960, 16; ld.shared.u32 %r1617, [%rd975+40]; // begin inline asm cvta.to.global.u64 %rd967, %rd968;red.global.add.f32 [%rd967], %r1617; // end inline asm $L__BB0_527: and.b64 %rd319, %rd1200, 3; shl.b64 %rd978, %rd1200, 2; and.b64 %rd979, %rd978, 12; or.b64 %rd320, %rd303, %rd979; setp.le.u64 %p718, %rd350, %rd320; @%p718 bra $L__BB0_529; mul.lo.s64 %rd990, %rd320, 56; add.s64 %rd981, %rd344, %rd990; or.b64 %rd991, %rd319, %rd289; shl.b64 %rd992, %rd991, 3; or.b64 %rd993, %rd302, %rd992; shl.b64 %rd994, %rd993, 6; mov.u64 %rd995, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; add.s64 %rd996, %rd995, %rd994; ld.shared.u32 %r1618, [%rd996+20]; // begin inline asm cvta.to.global.u64 %rd980, %rd981;red.global.add.f32 [%rd980], %r1618; // end inline asm add.s64 %rd983, %rd981, 4; ld.shared.u64 %rd997, [%rd996+24]; cvt.u32.u64 %r1619, %rd997; shr.u64 %rd998, %rd997, 32; cvt.u32.u64 %r1620, %rd998; // begin inline asm cvta.to.global.u64 %rd982, %rd983;red.global.add.f32 [%rd982], %r1619; // end inline asm add.s64 %rd985, %rd981, 8; // begin inline asm cvta.to.global.u64 %rd984, %rd985;red.global.add.f32 [%rd984], %r1620; // end inline asm add.s64 %rd987, %rd981, 12; ld.shared.u32 %r1621, [%rd996+44]; // begin inline asm cvta.to.global.u64 %rd986, %rd987;red.global.add.f32 [%rd986], %r1621; // end inline asm add.s64 %rd989, %rd981, 16; ld.shared.u32 %r1622, [%rd996+40]; // begin inline asm cvta.to.global.u64 %rd988, %rd989;red.global.add.f32 [%rd988], %r1622; // end inline asm $L__BB0_529: and.b64 %rd321, %rd1202, 3; shl.b64 %rd999, %rd1202, 2; and.b64 %rd1000, %rd999, 12; or.b64 %rd322, %rd305, %rd1000; setp.le.u64 %p719, %rd350, %rd322; @%p719 bra $L__BB0_531; mul.lo.s64 
%rd1011, %rd322, 56; add.s64 %rd1002, %rd344, %rd1011; or.b64 %rd1012, %rd321, %rd289; shl.b64 %rd1013, %rd1012, 3; or.b64 %rd1014, %rd304, %rd1013; shl.b64 %rd1015, %rd1014, 6; mov.u64 %rd1016, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; add.s64 %rd1017, %rd1016, %rd1015; ld.shared.u32 %r1623, [%rd1017+20]; // begin inline asm cvta.to.global.u64 %rd1001, %rd1002;red.global.add.f32 [%rd1001], %r1623; // end inline asm add.s64 %rd1004, %rd1002, 4; ld.shared.u64 %rd1018, [%rd1017+24]; cvt.u32.u64 %r1624, %rd1018; shr.u64 %rd1019, %rd1018, 32; cvt.u32.u64 %r1625, %rd1019; // begin inline asm cvta.to.global.u64 %rd1003, %rd1004;red.global.add.f32 [%rd1003], %r1624; // end inline asm add.s64 %rd1006, %rd1002, 8; // begin inline asm cvta.to.global.u64 %rd1005, %rd1006;red.global.add.f32 [%rd1005], %r1625; // end inline asm add.s64 %rd1008, %rd1002, 12; ld.shared.u32 %r1626, [%rd1017+44]; // begin inline asm cvta.to.global.u64 %rd1007, %rd1008;red.global.add.f32 [%rd1007], %r1626; // end inline asm add.s64 %rd1010, %rd1002, 16; ld.shared.u32 %r1627, [%rd1017+40]; // begin inline asm cvta.to.global.u64 %rd1009, %rd1010;red.global.add.f32 [%rd1009], %r1627; // end inline asm $L__BB0_531: add.s64 %rd1204, %rd1204, 4; and.b64 %rd324, %rd1203, 3; shl.b64 %rd1020, %rd1203, 2; and.b64 %rd1021, %rd1020, 12; or.b64 %rd325, %rd307, %rd1021; setp.le.u64 %p720, %rd350, %rd325; @%p720 bra $L__BB0_533; mul.lo.s64 %rd1032, %rd325, 56; add.s64 %rd1023, %rd344, %rd1032; or.b64 %rd1033, %rd324, %rd289; shl.b64 %rd1034, %rd1033, 3; or.b64 %rd1035, %rd306, %rd1034; shl.b64 %rd1036, %rd1035, 6; mov.u64 %rd1037, _ZN20sparkl2d_kernels_ptx4cuda5g2p2g13g2p2g_generic12shared_array6SHARED17h8458fd69631dca08E; add.s64 %rd1038, %rd1037, %rd1036; ld.shared.u32 %r1628, [%rd1038+20]; // begin inline asm cvta.to.global.u64 %rd1022, %rd1023;red.global.add.f32 [%rd1022], %r1628; // end inline asm add.s64 %rd1025, %rd1023, 4; ld.shared.u64 %rd1039, 
[%rd1038+24]; cvt.u32.u64 %r1629, %rd1039; shr.u64 %rd1040, %rd1039, 32; cvt.u32.u64 %r1630, %rd1040; // begin inline asm cvta.to.global.u64 %rd1024, %rd1025;red.global.add.f32 [%rd1024], %r1629; // end inline asm add.s64 %rd1027, %rd1023, 8; // begin inline asm cvta.to.global.u64 %rd1026, %rd1027;red.global.add.f32 [%rd1026], %r1630; // end inline asm add.s64 %rd1029, %rd1023, 12; ld.shared.u32 %r1631, [%rd1038+44]; // begin inline asm cvta.to.global.u64 %rd1028, %rd1029;red.global.add.f32 [%rd1028], %r1631; // end inline asm add.s64 %rd1031, %rd1023, 16; ld.shared.u32 %r1632, [%rd1038+40]; // begin inline asm cvta.to.global.u64 %rd1030, %rd1031;red.global.add.f32 [%rd1030], %r1632; // end inline asm $L__BB0_533: add.s64 %rd1203, %rd1203, 1; add.s64 %rd1202, %rd1202, 1; add.s64 %rd1201, %rd1201, 1; add.s64 %rd1200, %rd1200, 1; setp.lt.u64 %p721, %rd1204, %rd285; @%p721 bra $L__BB0_525; $L__BB0_534: ret; $L__BB0_399: setp.eq.f32 %p554, %f590, 0fBF800000; setp.eq.f32 %p555, %f595, 0f7F800000; and.pred %p556, %p554, %p555; @%p556 bra $L__BB0_404; setp.geu.f32 %p557, %f590, 0f00000000; mov.f32 %f4911, %f594; @%p557 bra $L__BB0_404; setp.eq.f32 %p558, %f592, 0f3F800000; neg.f32 %f3743, %f594; selp.f32 %f3744, %f3743, %f594, %p558; cvt.rmi.f32.f32 %f3745, %f591; setp.neu.f32 %p559, %f3745, %f591; selp.f32 %f4911, 0f7FFFFFFF, %f3744, %p559; bra.uni $L__BB0_404; $L__BB0_328: setp.eq.f32 %p456, %f416, 0fBF800000; setp.eq.f32 %p457, %f421, 0f7F800000; and.pred %p458, %p456, %p457; @%p458 bra $L__BB0_333; setp.geu.f32 %p459, %f416, 0f00000000; mov.f32 %f4877, %f420; @%p459 bra $L__BB0_333; setp.eq.f32 %p460, %f418, 0f3F800000; neg.f32 %f3220, %f420; selp.f32 %f3221, %f3220, %f420, %p460; cvt.rmi.f32.f32 %f3222, %f417; setp.neu.f32 %p461, %f3222, %f417; selp.f32 %f4877, 0f7FFFFFFF, %f3221, %p461; bra.uni $L__BB0_333; $L__BB0_208: abs.f32 %f4779, %f219; setp.eq.f32 %p303, %f219, 0f00000000; setp.eq.f32 %p304, %f4779, 0f7F800000; or.pred %p305, %p303, %p304; @%p305 bra 
$L__BB0_211; bra.uni $L__BB0_209; $L__BB0_211: setp.eq.f32 %p308, %f257, 0f3F800000; add.f32 %f2298, %f219, %f219; mov.b32 %r860, %f2298; and.b32 %r861, %r860, 2147483647; selp.b32 %r862, %r860, %r861, %p308; mov.b32 %f4844, %r862; bra.uni $L__BB0_213; $L__BB0_217: setp.lt.f32 %p314, %f264, 0f00800000; mul.f32 %f2335, %f264, 0f4B800000; selp.f32 %f2336, %f2335, %f264, %p314; mov.b32 %r875, %f2336; add.s32 %r876, %r875, -1060439283; and.b32 %r877, %r876, -8388608; sub.s32 %r878, %r875, %r877; mov.b32 %f2337, %r878; cvt.rn.f32.s32 %f2338, %r877; selp.f32 %f2339, 0fC1C00000, 0f00000000, %p314; mov.f32 %f2340, 0f34000000; fma.rn.f32 %f2341, %f2338, %f2340, %f2339; add.f32 %f2342, %f2337, 0fBF800000; add.f32 %f2334, %f2337, 0f3F800000; mov.f32 %f2343, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f2333,%f2334; // end inline asm add.f32 %f2344, %f2342, %f2342; mul.f32 %f2345, %f2333, %f2344; mul.f32 %f2346, %f2345, %f2345; neg.f32 %f2347, %f2345; sub.f32 %f2348, %f2342, %f2345; add.f32 %f2349, %f2348, %f2348; fma.rn.f32 %f2350, %f2347, %f2342, %f2349; mul.rn.f32 %f2351, %f2333, %f2350; mov.f32 %f2352, 0f3B52E7DB; mov.f32 %f2353, 0f3A2C32E4; fma.rn.f32 %f2354, %f2353, %f2346, %f2352; mov.f32 %f2355, 0f3C93BB73; fma.rn.f32 %f2356, %f2354, %f2346, %f2355; mov.f32 %f2357, 0f3DF6384F; fma.rn.f32 %f2358, %f2356, %f2346, %f2357; mul.rn.f32 %f2359, %f2358, %f2346; mov.f32 %f2360, 0f3FB8AA3B; fma.rn.f32 %f2361, %f2345, %f2360, %f2341; mul.f32 %f2362, %f2359, 0f40400000; sub.f32 %f2363, %f2341, %f2361; fma.rn.f32 %f2364, %f2345, %f2360, %f2363; fma.rn.f32 %f2365, %f2351, %f2360, %f2364; mov.f32 %f2366, 0f32A55E34; fma.rn.f32 %f2367, %f2345, %f2366, %f2365; fma.rn.f32 %f2368, %f2362, %f2351, %f2367; fma.rn.f32 %f2369, %f2359, %f2345, %f2368; add.rn.f32 %f2370, %f2361, %f2369; mov.f32 %f2371, 0f3F000000; mul.rn.f32 %f2372, %f2370, %f2371; cvt.rni.f32.f32 %f2373, %f2372; sub.f32 %f2374, %f2372, %f2373; neg.f32 %f2375, %f2372; fma.rn.f32 %f2376, %f2370, %f2371, %f2375; neg.f32 
%f2377, %f2361; add.rn.f32 %f2378, %f2370, %f2377; neg.f32 %f2379, %f2378; add.rn.f32 %f2380, %f2369, %f2379; fma.rn.f32 %f2381, %f2380, %f2371, %f2376; add.f32 %f2382, %f2381, %f2374; setp.gt.f32 %p315, %f2373, 0f00000000; selp.b32 %r879, 0, -2097152000, %p315; setp.geu.f32 %p316, %f263, 0f00000000; setp.lt.f32 %p317, %f2372, 0f00000000; selp.f32 %f2383, 0f00000000, 0f7F800000, %p317; abs.f32 %f2384, %f2372; setp.gt.f32 %p318, %f2384, 0f43180000; cvt.rzi.s32.f32 %r880, %f2373; shl.b32 %r881, %r880, 23; sub.s32 %r882, %r881, %r879; mov.b32 %f2385, %r882; add.s32 %r883, %r879, 2130706432; mov.b32 %f2386, %r883; mov.f32 %f2387, 0f3AAF85ED; mov.f32 %f2388, 0f391FCB8E; fma.rn.f32 %f2389, %f2388, %f2382, %f2387; mov.f32 %f2390, 0f3C1D9856; fma.rn.f32 %f2391, %f2389, %f2382, %f2390; mov.f32 %f2392, 0f3D6357BB; fma.rn.f32 %f2393, %f2391, %f2382, %f2392; mov.f32 %f2394, 0f3E75FDEC; fma.rn.f32 %f2395, %f2393, %f2382, %f2394; mov.f32 %f2396, 0f3F317218; fma.rn.f32 %f2397, %f2395, %f2382, %f2396; fma.rn.f32 %f2398, %f2397, %f2382, %f2343; mul.f32 %f2399, %f2398, %f2386; mul.f32 %f2400, %f2399, %f2385; selp.f32 %f4845, %f2383, %f2400, %p318; @%p316 bra $L__BB0_221; mov.f32 %f4845, 0f7FFFFFFF; $L__BB0_221: ld.global.u8 %rs25, [%rd81+48]; setp.eq.s16 %p320, %rs25, 0; @%p320 bra $L__BB0_225; div.rn.f32 %f2410, %f219, %f263; setp.lt.f32 %p321, %f2410, 0f00800000; mul.f32 %f2411, %f2410, 0f4B000000; selp.f32 %f269, %f2411, %f2410, %p321; selp.f32 %f2412, 0fC1B80000, 0f00000000, %p321; mov.b32 %r887, %f269; add.s32 %r888, %r887, -1059760811; and.b32 %r889, %r888, -8388608; sub.s32 %r890, %r887, %r889; mov.b32 %f2413, %r890; cvt.rn.f32.s32 %f2414, %r889; mov.f32 %f2415, 0f34000000; fma.rn.f32 %f2416, %f2414, %f2415, %f2412; add.f32 %f2417, %f2413, 0fBF800000; mov.f32 %f2418, 0f3E1039F6; mov.f32 %f2419, 0fBE055027; fma.rn.f32 %f2420, %f2419, %f2417, %f2418; mov.f32 %f2421, 0fBDF8CDCC; fma.rn.f32 %f2422, %f2420, %f2417, %f2421; mov.f32 %f2423, 0f3E0F2955; fma.rn.f32 %f2424, %f2422, 
%f2417, %f2423; mov.f32 %f2425, 0fBE2AD8B9; fma.rn.f32 %f2426, %f2424, %f2417, %f2425; mov.f32 %f2427, 0f3E4CED0B; fma.rn.f32 %f2428, %f2426, %f2417, %f2427; mov.f32 %f2429, 0fBE7FFF22; fma.rn.f32 %f2430, %f2428, %f2417, %f2429; mov.f32 %f2431, 0f3EAAAA78; fma.rn.f32 %f2432, %f2430, %f2417, %f2431; mov.f32 %f2433, 0fBF000000; fma.rn.f32 %f2434, %f2432, %f2417, %f2433; mul.f32 %f2435, %f2417, %f2434; fma.rn.f32 %f2436, %f2435, %f2417, %f2417; mov.f32 %f2437, 0f3F317218; fma.rn.f32 %f4846, %f2416, %f2437, %f2436; setp.lt.u32 %p322, %r887, 2139095040; @%p322 bra $L__BB0_224; mov.f32 %f2438, 0f7F800000; fma.rn.f32 %f4846, %f269, %f2438, %f2438; $L__BB0_224: setp.eq.f32 %p323, %f269, 0f00000000; selp.f32 %f2439, 0fFF800000, %f4846, %p323; add.f32 %f4851, %f4851, %f2439; $L__BB0_225: mov.b64 {%r891, %r892}, %rd156; mov.b64 {%r893, %r894}, %rd155; mov.b32 %f2440, %r893; mul.f32 %f2441, %f2440, %f4845; mov.b32 %f2442, %r894; mul.f32 %f2443, %f2442, %f4845; mov.b32 %f2444, %r891; mul.f32 %f2445, %f2444, %f4845; mov.b32 %f2446, %r892; mul.f32 %f2447, %f2446, %f4845; mov.b64 {%r895, %r896}, %rd158; mov.b64 {%r897, %r898}, %rd157; mov.b32 %f2448, %r897; mov.b32 %f2449, %r898; mul.f32 %f2450, %f2449, %f2445; mul.f32 %f2451, %f2449, %f2447; mov.b32 %f2452, %r895; mov.b32 %f2453, %r896; mul.f32 %f2454, %f2453, %f2445; mul.f32 %f2455, %f2453, %f2447; fma.rn.f32 %f2456, %f2448, %f2443, %f2451; mov.b32 %r899, %f2456; fma.rn.f32 %f2457, %f2448, %f2441, %f2450; mov.b32 %r900, %f2457; fma.rn.f32 %f2458, %f2452, %f2443, %f2455; mov.b32 %r901, %f2458; fma.rn.f32 %f2459, %f2452, %f2441, %f2454; mov.b32 %r902, %f2459; mov.b64 %rd1158, {%r902, %r901}; mov.b64 %rd1157, {%r900, %r899}; bra.uni $L__BB0_238; $L__BB0_209: setp.geu.f32 %p306, %f219, 0f00000000; mov.f32 %f4844, %f258; @%p306 bra $L__BB0_213; setp.eq.f32 %p307, %f257, 0f3F800000; neg.f32 %f2297, %f258; selp.f32 %f4844, %f2297, %f258, %p307; $L__BB0_213: div.rn.f32 %f2300, %f4844, %f217; sqrt.rn.f32 %f2301, %f256; mul.f32 %f2302, 
%f2301, %f2300; div.rn.f32 %f2303, %f229, %f240; div.rn.f32 %f2304, %f230, %f240; fma.rn.f32 %f2305, %f2303, %f2302, %f228; fma.rn.f32 %f2306, %f2304, %f2302, %f228; sqrt.rn.f32 %f2307, %f2305; sqrt.rn.f32 %f2308, %f2306; mov.b64 {%r863, %r864}, %rd156; mov.b64 {%r865, %r866}, %rd155; mov.b32 %f2309, %r865; mul.f32 %f2310, %f2309, %f2307; mov.b32 %f2311, %r866; mul.f32 %f2312, %f2311, %f2307; mov.b32 %f2313, %r863; mul.f32 %f2314, %f2313, %f2308; mov.b32 %f2315, %r864; mul.f32 %f2316, %f2315, %f2308; mov.b64 {%r867, %r868}, %rd158; mov.b64 {%r869, %r870}, %rd157; mov.b32 %f2317, %r869; mov.b32 %f2318, %r870; mul.f32 %f2319, %f2318, %f2314; mul.f32 %f2320, %f2318, %f2316; mov.b32 %f2321, %r867; mov.b32 %f2322, %r868; mul.f32 %f2323, %f2322, %f2314; mul.f32 %f2324, %f2322, %f2316; fma.rn.f32 %f2325, %f2317, %f2312, %f2320; mov.b32 %r871, %f2325; fma.rn.f32 %f2326, %f2317, %f2310, %f2319; mov.b32 %r872, %f2326; fma.rn.f32 %f2327, %f2321, %f2312, %f2324; mov.b32 %r873, %f2327; fma.rn.f32 %f2328, %f2321, %f2310, %f2323; mov.b32 %r874, %f2328; mov.b64 %rd1158, {%r874, %r873}; mov.b64 %rd1157, {%r872, %r871}; $L__BB0_238: mov.b64 {%r931, %r932}, %rd1157; mov.b32 %f4864, %r932; mov.b64 {%r933, %r934}, %rd1158; mov.b32 %f4863, %r933; st.local.v4.u32 [%rd77], {%r931, %r932, %r933, %r934}; st.f32 [%rd144], %f4851; bra.uni $L__BB0_297; $L__BB0_536: trap; $L__BB0_535: trap; $L__BB0_159: trap; $L__BB0_388: trap; } // .globl grid_update .visible .entry grid_update( .param .f32 grid_update_param_0, .param .align 8 .b8 grid_update_param_1[72], .param .u64 grid_update_param_2, .param .u64 grid_update_param_3, .param .align 4 .b8 grid_update_param_4[8] ) { .local .align 16 .b8 __local_depot1[736]; .reg .b64 %SP; .reg .b64 %SPL; .reg .pred %p<2964>; .reg .b16 %rs<1043>; .reg .f32 %f<5498>; .reg .b32 %r<5440>; .reg .b64 %rd<13086>; mov.u64 %SPL, __local_depot1; cvta.local.u64 %SP, %SPL; ld.param.f32 %f1578, [grid_update_param_0]; ld.param.u64 %rd5233, [grid_update_param_2]; 
ld.param.u64 %rd40, [grid_update_param_3]; ld.param.f32 %f1581, [grid_update_param_4+4]; ld.param.f32 %f1580, [grid_update_param_4]; ld.param.u64 %rd5232, [grid_update_param_1+64]; ld.param.u64 %rd5227, [grid_update_param_1+16]; ld.param.u64 %rd5226, [grid_update_param_1+8]; ld.param.f32 %f1579, [grid_update_param_1]; add.u64 %rd5235, %SP, 0; add.u64 %rd1, %SPL, 0; add.u64 %rd2, %SPL, 0; add.u64 %rd3, %SPL, 0; add.u64 %rd4, %SPL, 0; add.u64 %rd5, %SPL, 0; add.u64 %rd6, %SPL, 0; add.u64 %rd5241, %SP, 560; add.u64 %rd7, %SPL, 560; add.u64 %rd8, %SPL, 704; add.u64 %rd9, %SPL, 32; add.u64 %rd5244, %SP, 552; add.u64 %rd10, %SPL, 552; mov.u32 %r1634, %tid.y; mov.u32 %r1635, %tid.x; mov.u32 %r1636, %ctaid.x; cvt.u64.u32 %rd11, %r1636; mul.wide.u32 %rd5245, %r1636, 16; cvt.u64.u32 %rd5246, %r1635; add.s64 %rd5247, %rd5246, %rd5245; mul.wide.u32 %rd5248, %r1634, 4; add.s64 %rd5249, %rd5247, %rd5248; setp.le.u64 %p115, %rd5232, %rd5249; @%p115 bra $L__BB1_2148; cvta.to.global.u64 %rd5250, %rd5227; mul.lo.s64 %rd5251, %rd11, 24; add.s64 %rd5252, %rd5250, %rd5251; ld.global.u64 %rd5253, [%rd5252]; cvta.to.global.u64 %rd5254, %rd5226; shr.u64 %rd5255, %rd5253, 30; and.b64 %rd5256, %rd5255, 17179869180; cvt.u64.u32 %rd5257, %r1634; add.s64 %rd5258, %rd5257, %rd5256; add.s64 %rd5259, %rd5258, -8589934592; shl.b64 %rd5260, %rd5253, 2; and.b64 %rd5261, %rd5260, 17179869180; add.s64 %rd5263, %rd5246, %rd5261; add.s64 %rd5264, %rd5263, -8589934592; mul.lo.s64 %rd5269, %rd5249, 56; cvt.rn.f32.s64 %f1582, %rd5264; cvt.rn.f32.s64 %f1583, %rd5259; mul.f32 %f2, %f1579, %f1582; mul.f32 %f3, %f1579, %f1583; add.s64 %rd5270, %rd5254, %rd5269; add.s64 %rd13, %rd5270, 4; ld.global.f32 %f1584, [%rd5270]; mul.f32 %f1585, %f1580, %f1584; mul.f32 %f1586, %f1581, %f1584; ld.global.u32 %rd5271, [%rd5270+4]; ld.global.u32 %rd5272, [%rd5270+8]; bfi.b64 %rd5273, %rd5272, %rd5271, 32, 32; cvt.u32.u64 %r1640, %rd5273; shr.u64 %rd5274, %rd5273, 32; cvt.u32.u64 %r1641, %rd5274; mov.b32 %f1587, %r1640; 
fma.rn.f32 %f1588, %f1585, %f1578, %f1587; mov.b32 %f1589, %r1641; fma.rn.f32 %f1590, %f1586, %f1578, %f1589; setp.eq.f32 %p116, %f1584, 0f00000000; rcp.rn.f32 %f1591, %f1584; selp.f32 %f1592, 0f00000000, %f1591, %p116; mul.f32 %f4, %f1592, %f1588; mul.f32 %f5, %f1592, %f1590; ld.global.u64 %rd13070, [%rd5270+24]; setp.ne.s64 %p117, %rd13070, 0; @%p117 bra $L__BB1_2132; cvta.to.global.u64 %rd11381, %rd5233; add.f32 %f6, %f1579, %f1579; add.s64 %rd17, %rd1, 8; add.s64 %rd18, %rd2, 8; add.s64 %rd19, %rd3, 8; add.s64 %rd20, %rd4, 8; add.s64 %rd21, %rd5, 8; add.s64 %rd22, %rd6, 8; add.s64 %rd23, %rd10, 8; add.u64 %rd24, %SPL, 16; mov.u16 %rs1, 2; mov.u64 %rd11388, 0; mov.u64 %rd42, %rd5233; bra.uni $L__BB1_3; $L__BB1_237: add.s64 %rd11381, %rd35, 280; and.b16 %rs1, %rs23, 1; mov.b32 %r1969, %f175; cvt.u64.u32 %rd5876, %r1969; mov.b32 %r1970, %f174; cvt.u64.u32 %rd5877, %r1970; bfi.b64 %rd30, %rd5876, %rd5877, 32, 32; add.s64 %rd11388, %rd38, 1; mov.u64 %rd29, %rd38; $L__BB1_3: and.b16 %rs210, %rs1, 255; setp.eq.s16 %p118, %rs210, 2; selp.f32 %f8, 0f7F7FFFFF, %f5312, %p118; $L__BB1_5: mov.u64 %rd38, %rd11388; mov.u64 %rd35, %rd11381; setp.eq.s64 %p119, %rd40, 0; @%p119 bra $L__BB1_238; add.s64 %rd40, %rd40, -1; setp.eq.s64 %p120, %rd42, 0; @%p120 bra $L__BB1_238; add.s64 %rd11381, %rd35, 280; add.s64 %rd42, %rd42, 280; ld.global.u32 %r1642, [%rd35+272]; setp.eq.s32 %p121, %r1642, 3; add.s64 %rd11388, %rd38, 1; @%p121 bra $L__BB1_5; ld.global.u16 %rs211, [%rd35]; setp.eq.s16 %p122, %rs211, 1; @%p122 bra $L__BB1_179; setp.eq.s16 %p123, %rs211, 2; @%p123 bra $L__BB1_68; setp.ne.s16 %p124, %rs211, 3; @%p124 bra $L__BB1_217; ld.global.u8 %rs2, [%rd35+24]; ld.global.f32 %f9, [%rd35+256]; sub.f32 %f1594, %f2, %f9; ld.global.f32 %f10, [%rd35+260]; sub.f32 %f1595, %f3, %f10; ld.global.f32 %f1596, [%rd35+252]; ld.global.f32 %f11, [%rd35+248]; mul.f32 %f1597, %f1595, %f1596; fma.rn.f32 %f12, %f1594, %f11, %f1597; mul.f32 %f1598, %f1594, %f1596; mul.f32 %f1599, %f1595, %f11; 
sub.f32 %f13, %f1599, %f1598; add.u64 %rd45, %SPL, 32; mov.u32 %r23, 2; st.local.u32 [%rd45+20], %r23; ld.global.u64 %rd46, [%rd35+16]; setp.eq.s64 %p125, %rd46, 0; @%p125 bra $L__BB1_65; ld.global.u64 %rd47, [%rd35+8]; mov.u64 %rd11390, 1; bra.uni $L__BB1_13; $L__BB1_21: sub.f32 %f1611, %f5314, %f12; abs.f32 %f30, %f1611; setp.le.f32 %p135, %f30, 0f34000000; @%p135 bra $L__BB1_23; abs.f32 %f1612, %f5314; abs.f32 %f1613, %f12; setp.gt.f32 %p137, %f1613, %f1612; selp.f32 %f1614, %f1613, %f1612, %p137; mul.f32 %f1615, %f1614, 0f34000000; setp.gtu.f32 %p138, %f30, %f1615; @%p138 bra $L__BB1_27; bra.uni $L__BB1_23; $L__BB1_13: shl.b64 %rd5281, %rd11390, 3; add.s64 %rd5282, %rd47, %rd5281; setp.eq.s64 %p126, %rd11390, %rd46; selp.b64 %rd5283, 0, %rd11390, %p126; shl.b64 %rd5284, %rd5283, 3; add.s64 %rd5285, %rd47, %rd5284; ld.u32 %rd5286, [%rd5282+-8]; ld.u32 %rd5287, [%rd5282+-4]; bfi.b64 %rd50, %rd5287, %rd5286, 32, 32; ld.u32 %rd5288, [%rd5285]; ld.u32 %rd5289, [%rd5285+4]; bfi.b64 %rd51, %rd5289, %rd5288, 32, 32; cvt.u32.u64 %r6, %rd50; mov.b32 %f17, %r6; shr.u64 %rd5290, %rd50, 32; cvt.u32.u64 %r1663, %rd5290; mov.b32 %f18, %r1663; cvt.u32.u64 %r7, %rd51; shr.u64 %rd5291, %rd51, 32; cvt.u32.u64 %r1664, %rd5291; mov.b32 %f19, %r7; sub.f32 %f20, %f19, %f17; mov.b32 %f1601, %r1664; sub.f32 %f21, %f1601, %f18; sub.f32 %f1602, %f12, %f17; sub.f32 %f1603, %f13, %f18; mul.f32 %f1604, %f21, %f1603; fma.rn.f32 %f22, %f20, %f1602, %f1604; mul.f32 %f1605, %f21, %f21; fma.rn.f32 %f1606, %f20, %f20, %f1605; add.f32 %f23, %f1606, 0f00000000; setp.gtu.f32 %p127, %f22, 0f00000000; mov.b64 {%r1665, %r4779}, %rd50; mov.b64 {%r1666, %r9}, %rd51; @%p127 bra $L__BB1_15; bra.uni $L__BB1_14; $L__BB1_15: setp.ltu.f32 %p128, %f22, %f23; @%p128 bra $L__BB1_17; bra.uni $L__BB1_16; $L__BB1_17: setp.eq.f32 %p129, %f23, 0f00000000; @%p129 bra $L__BB1_64; mov.b32 %f5266, %r1663; shr.u64 %rd11342, %rd51, 32; cvt.u32.u64 %r4748, %rd11342; shr.u64 %rd11341, %rd50, 32; cvt.u32.u64 %r4747, %rd11341; 
mov.b32 %f5265, %r4747; mov.b32 %f5264, %r4748; sub.f32 %f5263, %f5264, %f5265; mov.b32 %f5262, %r6; cvt.u32.u64 %r4746, %rd51; cvt.u32.u64 %r4745, %rd50; mov.b32 %f5261, %r4745; mov.b32 %f5260, %r4746; sub.f32 %f5259, %f5260, %f5261; div.rn.f32 %f1607, %f22, %f23; mov.f32 %f1608, 0f3F800000; sub.f32 %f1609, %f1608, %f1607; mov.b32 %r4781, %f1609; mov.b32 %r4782, %f1607; fma.rn.f32 %f5314, %f5259, %f1607, %f5261; mov.b32 %r4778, %f5314; fma.rn.f32 %f5315, %f5263, %f1607, %f5265; mov.b32 %r4779, %f5315; mov.u32 %r4780, 1; bra.uni $L__BB1_19; $L__BB1_14: cvt.u32.u64 %r4778, %rd50; mov.b32 %f5314, %r4778; mov.b32 %f5315, %r4779; mov.u32 %r4780, 0; mov.u32 %r4781, %r4780; bra.uni $L__BB1_19; $L__BB1_16: cvt.u32.u64 %r4744, %rd51; cvt.u32.u64 %r4778, %rd51; mov.b32 %f5314, %r4778; mov.b32 %f5315, %r9; mov.u32 %r4781, 1; mov.u32 %r4780, 0; mov.u32 %r4779, %r9; $L__BB1_19: setp.eq.f32 %p130, %f12, %f5314; @%p130 bra $L__BB1_23; bra.uni $L__BB1_20; $L__BB1_23: setp.eq.f32 %p140, %f5315, %f13; mov.pred %p139, -1; mov.pred %p2907, %p139; @%p140 bra $L__BB1_27; mov.b32 %r4742, %f13; and.b32 %r4741, %r4742, 2147483647; mov.b32 %f5257, %r4741; setp.eq.f32 %p142, %f5257, 0f7F800000; and.b32 %r1675, %r4779, 2147483647; mov.b32 %f1616, %r1675; setp.eq.f32 %p143, %f1616, 0f7F800000; or.pred %p144, %p142, %p143; mov.pred %p2907, 0; @%p144 bra $L__BB1_27; sub.f32 %f1617, %f5315, %f13; abs.f32 %f31, %f1617; setp.le.f32 %p146, %f31, 0f34000000; mov.pred %p2907, %p139; @%p146 bra $L__BB1_27; abs.f32 %f1618, %f5315; abs.f32 %f1619, %f13; setp.gt.f32 %p147, %f1619, %f1618; selp.f32 %f1620, %f1619, %f1618, %p147; mul.f32 %f1621, %f1620, 0f34000000; setp.le.f32 %p2907, %f31, %f1621; bra.uni $L__BB1_27; $L__BB1_20: mov.b32 %r4740, %f12; and.b32 %r4739, %r4740, 2147483647; mov.b32 %f5256, %r4739; setp.eq.f32 %p132, %f5256, 0f7F800000; and.b32 %r1674, %r4778, 2147483647; mov.b32 %f1610, %r1674; setp.eq.f32 %p133, %f1610, 0f7F800000; or.pred %p134, %p132, %p133; mov.pred %p2907, 0; @%p134 bra 
$L__BB1_27; bra.uni $L__BB1_21; $L__BB1_27: cvt.u64.u32 %rd5292, %r4779; cvt.u64.u32 %rd5293, %r4778; bfi.b64 %rd52, %rd5292, %rd5293, 32, 32; mov.b64 {%r1676, %r1677}, %rd52; selp.u64 %rd53, 1, 0, %p2907; mov.b32 %f33, %r1677; mov.b32 %f32, %r1676; sub.f32 %f1622, %f32, %f12; sub.f32 %f1623, %f33, %f13; mul.f32 %f1624, %f1623, %f1623; fma.rn.f32 %f1625, %f1622, %f1622, %f1624; add.f32 %f1626, %f1625, 0f00000000; sqrt.rn.f32 %f35, %f1626; setp.geu.f32 %p148, %f35, %f5316; setp.ne.s32 %p149, %r23, 2; and.pred %p150, %p149, %p148; @%p150 bra $L__BB1_29; shr.u64 %rd11340, %rd51, 32; shr.u64 %rd11339, %rd50, 32; add.s64 %rd11391, %rd11390, -1; st.local.u64 [%rd45], %rd11391; st.local.v2.f32 [%rd45+8], {%f32, %f33}; mov.b64 {%r1680, %r1681}, %rd53; st.local.v2.u32 [%rd45+16], {%r1680, %r4780}; st.local.v2.u32 [%rd45+24], {%r4781, %r4782}; st.local.f32 [%rd45+32], %f35; st.local.u32 [%rd45+36], %rd50; st.local.u32 [%rd45+44], %rd51; st.local.u32 [%rd45+40], %rd11339; st.local.u32 [%rd45+48], %rd11340; mov.f32 %f5316, %f35; mov.u32 %r23, %r4780; $L__BB1_29: add.s64 %rd56, %rd11390, 1; setp.lt.u64 %p151, %rd11390, %rd46; mov.u64 %rd11390, %rd56; @%p151 bra $L__BB1_13; ld.local.u32 %rd5300, [%rd45+36]; ld.local.u32 %rd5301, [%rd45+40]; bfi.b64 %rd5302, %rd5301, %rd5300, 32, 32; mov.u64 %rd5299, 0; cvt.u32.u64 %r1682, %rd5302; mov.b32 %f1627, %r1682; shr.u64 %rd5303, %rd5302, 32; cvt.u32.u64 %r1683, %rd5303; mov.b32 %f1628, %r1683; ld.local.u32 %rd5304, [%rd45+44]; ld.local.u32 %rd5305, [%rd45+48]; bfi.b64 %rd5306, %rd5305, %rd5304, 32, 32; cvt.u32.u64 %r1684, %rd5306; shr.u64 %rd5307, %rd5306, 32; cvt.u32.u64 %r1685, %rd5307; mov.b32 %f1629, %r1684; sub.f32 %f37, %f1629, %f1627; mov.b32 %f1630, %r1685; sub.f32 %f38, %f1630, %f1628; mul.f32 %f1631, %f38, %f38; fma.rn.f32 %f1632, %f37, %f37, %f1631; add.f32 %f39, %f1632, 0f00000000; setp.leu.f32 %p152, %f39, 0f28800000; mov.u64 %rd11392, %rd5299; mov.u64 %rd11393, %rd5299; mov.u64 %rd11394, %rd5299; @%p152 bra $L__BB1_32; 
neg.f32 %f1633, %f37; sqrt.rn.f32 %f1634, %f39; div.rn.f32 %f1635, %f38, %f1634; div.rn.f32 %f1636, %f1633, %f1634; mov.b32 %r1686, %f1636; mov.b32 %r1687, %f1635; mov.u64 %rd11394, 1; mov.b64 %rd5310, {%r1687, %r1686}; shr.u64 %rd11393, %rd5310, 32; shl.b64 %rd11392, %rd5310, 32; $L__BB1_32: or.b64 %rd63, %rd11394, %rd11392; or.b64 %rd64, %rd5299, %rd11393; and.b64 %rd5311, %rd5299, 4294967295; xor.b64 %rd5312, %rd11394, 1; or.b64 %rd5313, %rd5312, %rd5311; setp.ne.s64 %p153, %rd5313, 0; @%p153 bra $L__BB1_63; mov.b64 {%r1688, %r1689}, %rd64; mov.b64 {%r1690, %r1691}, %rd63; mov.b32 %f40, %r1691; mov.b32 %f41, %r1688; setp.eq.s32 %p154, %r23, 1; @%p154 bra $L__BB1_61; bra.uni $L__BB1_34; $L__BB1_61: ld.local.u64 %rd5396, [%rd45+8]; cvt.u32.u64 %r1712, %rd5396; mov.b32 %f1664, %r1712; shr.u64 %rd5397, %rd5396, 32; cvt.u32.u64 %r1713, %rd5397; mov.b32 %f1665, %r1713; sub.f32 %f1666, %f2, %f1664; sub.f32 %f1667, %f3, %f1665; mul.f32 %f1668, %f41, %f1667; fma.rn.f32 %f1669, %f40, %f1666, %f1668; setp.le.f32 %p2908, %f1669, 0f00000000; bra.uni $L__BB1_62; $L__BB1_68: ld.global.f32 %f1680, [%rd35+256]; mov.u64 %rd11548, 0; sub.f32 %f1681, %f2, %f1680; ld.global.f32 %f1682, [%rd35+260]; sub.f32 %f1683, %f3, %f1682; ld.global.f32 %f1684, [%rd35+252]; ld.global.f32 %f1685, [%rd35+248]; mul.f32 %f1686, %f1683, %f1684; fma.rn.f32 %f49, %f1681, %f1685, %f1686; mul.f32 %f1687, %f1681, %f1684; mul.f32 %f1688, %f1683, %f1685; sub.f32 %f50, %f1688, %f1687; mov.b32 %r1721, %f49; mov.b32 %r1722, %f50; cvt.u64.u32 %rd5418, %r1722; cvt.u64.u32 %rd5419, %r1721; bfi.b64 %rd5420, %rd5418, %rd5419, 32, 32; st.local.u64 [%rd10], %rd5420; ld.global.u64 %rd166, [%rd35+32]; setp.eq.s64 %p175, %rd166, 0; mov.u64 %rd11549, 2; mov.u64 %rd11550, %rd11548; @%p175 bra $L__BB1_174; mov.u32 %r1729, 0; st.local.u32 [%rd9], %r1729; mov.u32 %r1730, -16777217; st.local.u32 [%rd9+4], %r1730; mov.u32 %r45, 1; st.local.u32 [%rd9+512], %r45; ld.global.u64 %rd168, [%rd35+24]; ld.global.u64 %rd169, 
[%rd35+80]; ld.global.u64 %rd170, [%rd35+72]; mov.u32 %r43, 2139095039; mov.u32 %r42, 4; bra.uni $L__BB1_71; $L__BB1_179: ld.global.f32 %f115, [%rd35+256]; sub.f32 %f1887, %f2, %f115; ld.global.f32 %f116, [%rd35+260]; sub.f32 %f1888, %f3, %f116; ld.global.f32 %f1889, [%rd35+252]; ld.global.f32 %f117, [%rd35+248]; mul.f32 %f1890, %f1888, %f1889; fma.rn.f32 %f118, %f1887, %f117, %f1890; mul.f32 %f1891, %f1887, %f1889; mul.f32 %f1892, %f1888, %f117; sub.f32 %f119, %f1892, %f1891; mov.b32 %r146, %f118; mov.b32 %r147, %f119; ld.global.u64 %rd528, [%rd35+56]; ld.global.u64 %rd527, [%rd35+48]; sub.f32 %f1893, %f118, %f6; sub.f32 %f1894, %f119, %f6; mov.b32 %r1901, %f1893; mov.b32 %r1902, %f1894; cvt.u64.u32 %rd5795, %r1902; cvt.u64.u32 %rd5796, %r1901; add.f32 %f1895, %f6, %f118; add.f32 %f1896, %f6, %f119; mov.b32 %r1903, %f1895; mov.b32 %r1904, %f1896; cvt.u64.u32 %rd5797, %r1904; cvt.u64.u32 %rd5798, %r1903; bfi.b64 %rd5799, %rd5795, %rd5796, 32, 32; mov.b64 {%r1905, %r1906}, %rd5799; bfi.b64 %rd5800, %rd5797, %rd5798, 32, 32; mov.b64 {%r1907, %r1908}, %rd5800; add.u64 %rd529, %SPL, 32; mov.u16 %rs277, 2; st.local.u8 [%rd529+8], %rs277; mov.b32 %f123, %r1908; mov.b32 %f121, %r1906; mov.b32 %f122, %r1907; mov.b32 %f120, %r1905; ld.global.v2.f32 {%f1897, %f1898}, [%rd35+40]; div.rn.f32 %f126, %f120, %f1897; div.rn.f32 %f127, %f122, %f1897; ld.global.u64 %rd530, [%rd35+16]; cvt.rn.f32.u64 %f1899, %rd530; add.f32 %f1900, %f1899, 0fBF800000; rcp.rn.f32 %f128, %f1900; setp.lt.f32 %p342, %f127, 0fBF000000; setp.gt.f32 %p343, %f126, 0f3F000000; or.pred %p344, %p343, %p342; @%p344 bra $L__BB1_211; add.f32 %f1901, %f126, 0f3F000000; div.rn.f32 %f1902, %f1901, %f128; cvt.rmi.f32.f32 %f1903, %f1902; add.s64 %rd5802, %rd530, -2; cvt.rn.f32.u64 %f1904, %rd5802; setp.gt.f32 %p345, %f1903, 0f00000000; setp.lt.f32 %p346, %f1903, %f1904; selp.f32 %f1905, %f1903, %f1904, %p346; selp.f32 %f1906, %f1905, 0f00000000, %p345; setp.gt.f32 %p347, %f1906, 0f5F7FFFFF; max.f32 %f1907, %f1906, 
0f00000000; cvt.rzi.u64.f32 %rd5803, %f1907; selp.b64 %rd536, -1, %rd5803, %p347; add.f32 %f1908, %f127, 0f3F000000; div.rn.f32 %f1909, %f1908, %f128; cvt.rpi.f32.f32 %f1910, %f1909; add.s64 %rd5804, %rd530, -1; cvt.rn.f32.u64 %f1911, %rd5804; setp.gt.f32 %p348, %f1910, 0f00000000; setp.lt.f32 %p349, %f1910, %f1911; selp.f32 %f1912, %f1910, %f1911, %p349; selp.f32 %f1913, %f1912, 0f00000000, %p348; setp.gt.f32 %p350, %f1913, 0f5F7FFFFF; max.f32 %f1914, %f1913, 0f00000000; cvt.rzi.u64.f32 %rd5805, %f1914; selp.b64 %rd532, -1, %rd5805, %p350; setp.ge.u64 %p351, %rd536, %rd532; @%p351 bra $L__BB1_211; div.rn.f32 %f129, %f121, %f1898; div.rn.f32 %f130, %f123, %f1898; ld.global.u64 %rd533, [%rd35+32]; ld.global.u64 %rd534, [%rd35+24]; ld.global.u64 %rd535, [%rd35+8]; ld.local.v4.u32 {%r4843, %r4844, %r4845, %r1914}, [%rd529]; mov.f32 %f5328, 0f7F7FFFFF; $L__BB1_182: setp.gt.u64 %p352, %rd533, %rd536; @%p352 bra $L__BB1_184; bra.uni $L__BB1_183; $L__BB1_184: add.s64 %rd5806, %rd534, %rd536; ld.u8 %rs278, [%rd5806]; setp.eq.s16 %p353, %rs278, 0; @%p353 bra $L__BB1_209; cvt.rn.f32.u64 %f1916, %rd536; fma.rn.f32 %f134, %f128, %f1916, 0fBF000000; setp.gt.u64 %p354, %rd530, %rd536; @%p354 bra $L__BB1_187; bra.uni $L__BB1_186; $L__BB1_187: shl.b64 %rd5807, %rd536, 2; add.s64 %rd537, %rd535, %rd5807; ld.f32 %f135, [%rd537]; add.s64 %rd5808, %rd536, 1; setp.gt.u64 %p355, %rd530, %rd5808; @%p355 bra $L__BB1_189; bra.uni $L__BB1_188; $L__BB1_189: ld.f32 %f136, [%rd537+4]; setp.gt.f32 %p356, %f136, %f130; setp.gt.f32 %p357, %f135, %f130; and.pred %p358, %p357, %p356; @%p358 bra $L__BB1_209; setp.lt.f32 %p359, %f135, %f129; setp.lt.f32 %p360, %f136, %f129; and.pred %p361, %p359, %p360; @%p361 bra $L__BB1_209; cvt.rn.f32.u64 %f5278, %rd536; fma.rn.f32 %f5277, %f128, %f5278, 0fBF000000; mul.f32 %f1917, %f1897, %f5277; mov.b32 %r1915, %f1917; mul.f32 %f139, %f1898, %f135; mov.b32 %r1916, %f139; cvt.u64.u32 %rd5809, %r1916; cvt.u64.u32 %rd5810, %r1915; add.f32 %f1918, %f128, %f5277; 
mul.f32 %f137, %f1897, %f1918; mov.b32 %r4841, %f137; mul.f32 %f1919, %f1898, %f136; mov.b32 %r1917, %f1919; cvt.u64.u32 %rd5811, %r1917; cvt.u64.u32 %rd5812, %r4841; bfi.b64 %rd5813, %rd5811, %rd5812, 32, 32; bfi.b64 %rd5814, %rd5809, %rd5810, 32, 32; cvt.u32.u64 %r155, %rd5814; mov.b32 %f138, %r155; sub.f32 %f140, %f137, %f138; sub.f32 %f141, %f1919, %f139; sub.f32 %f1920, %f118, %f138; sub.f32 %f1921, %f119, %f139; mul.f32 %f1922, %f141, %f1921; fma.rn.f32 %f142, %f140, %f1920, %f1922; mul.f32 %f1923, %f141, %f141; fma.rn.f32 %f1924, %f140, %f140, %f1923; add.f32 %f143, %f1924, 0f00000000; setp.gtu.f32 %p362, %f142, 0f00000000; mov.b64 {%r1918, %r4842}, %rd5814; mov.b64 {%r1919, %r157}, %rd5813; @%p362 bra $L__BB1_193; bra.uni $L__BB1_192; $L__BB1_193: setp.ltu.f32 %p363, %f142, %f143; @%p363 bra $L__BB1_195; bra.uni $L__BB1_194; $L__BB1_195: setp.eq.f32 %p364, %f143, 0f00000000; @%p364 bra $L__BB1_208; cvt.u32.u64 %r4752, %rd5814; mov.b32 %f5283, %r4752; div.rn.f32 %f1925, %f142, %f143; fma.rn.f32 %f5326, %f140, %f1925, %f5283; mov.b32 %r4841, %f5326; fma.rn.f32 %f5327, %f141, %f1925, %f139; mov.b32 %r4842, %f5327; bra.uni $L__BB1_197; $L__BB1_192: cvt.u32.u64 %r4841, %rd5814; mov.b32 %f5326, %r4841; mov.b32 %f5327, %r4842; bra.uni $L__BB1_197; $L__BB1_194: cvt.rn.f32.u64 %f5282, %rd536; add.f32 %f5281, %f128, %f5277; mul.f32 %f5326, %f1897, %f5281; mov.b32 %f5327, %r157; mov.u32 %r4842, %r157; $L__BB1_197: setp.eq.f32 %p365, %f118, %f5326; @%p365 bra $L__BB1_201; bra.uni $L__BB1_198; $L__BB1_201: setp.eq.f32 %p375, %f5327, %f119; mov.pred %p374, -1; mov.pred %p2910, %p374; @%p375 bra $L__BB1_205; and.b32 %r4751, %r147, 2147483647; mov.b32 %f5279, %r4751; setp.eq.f32 %p377, %f5279, 0f7F800000; and.b32 %r1921, %r4842, 2147483647; mov.b32 %f1932, %r1921; setp.eq.f32 %p378, %f1932, 0f7F800000; or.pred %p379, %p377, %p378; mov.pred %p2910, 0; @%p379 bra $L__BB1_205; sub.f32 %f1933, %f5327, %f119; abs.f32 %f151, %f1933; setp.le.f32 %p381, %f151, 0f34000000; mov.pred 
%p2910, %p374; @%p381 bra $L__BB1_205; abs.f32 %f1934, %f5327; abs.f32 %f1935, %f119; setp.gt.f32 %p382, %f1935, %f1934; selp.f32 %f1936, %f1935, %f1934, %p382; mul.f32 %f1937, %f1936, 0f34000000; setp.le.f32 %p2910, %f151, %f1937; bra.uni $L__BB1_205; $L__BB1_198: and.b32 %r4750, %r146, 2147483647; mov.b32 %f5276, %r4750; setp.eq.f32 %p367, %f5276, 0f7F800000; and.b32 %r1920, %r4841, 2147483647; mov.b32 %f1926, %r1920; setp.eq.f32 %p368, %f1926, 0f7F800000; or.pred %p369, %p367, %p368; mov.pred %p2910, 0; @%p369 bra $L__BB1_205; sub.f32 %f1927, %f5326, %f118; abs.f32 %f150, %f1927; setp.le.f32 %p370, %f150, 0f34000000; @%p370 bra $L__BB1_201; abs.f32 %f1928, %f5326; abs.f32 %f1929, %f118; setp.gt.f32 %p372, %f1929, %f1928; selp.f32 %f1930, %f1929, %f1928, %p372; mul.f32 %f1931, %f1930, 0f34000000; setp.gtu.f32 %p373, %f150, %f1931; @%p373 bra $L__BB1_205; bra.uni $L__BB1_201; $L__BB1_205: cvt.u64.u32 %rd5815, %r4842; cvt.u64.u32 %rd5816, %r4841; bfi.b64 %rd538, %rd5815, %rd5816, 32, 32; mov.b64 {%r1922, %r1923}, %rd538; selp.u64 %rd539, 1, 0, %p2910; mov.b32 %f1938, %r1922; sub.f32 %f1939, %f1938, %f118; mov.b32 %f1940, %r1923; sub.f32 %f1941, %f1940, %f119; mul.f32 %f1942, %f1941, %f1941; fma.rn.f32 %f1943, %f1939, %f1939, %f1942; add.f32 %f152, %f1943, 0f00000000; setp.geu.f32 %p383, %f152, %f5328; @%p383 bra $L__BB1_209; sqrt.rn.f32 %f1944, %f152; setp.gtu.f32 %p384, %f1944, %f6; mov.f32 %f5328, %f152; @%p384 bra $L__BB1_209; mov.b64 {%r4845, %r1924}, %rd539; mov.u32 %r4843, %r1922; mov.u32 %r4844, %r1923; mov.f32 %f5328, %f152; $L__BB1_209: add.s64 %rd536, %rd536, 1; setp.lt.u64 %p385, %rd536, %rd532; @%p385 bra $L__BB1_182; add.u64 %rd11344, %SPL, 32; st.local.u32 [%rd11344+8], %r4845; mov.b64 %rd5817, {%r4843, %r4844}; st.local.u64 [%rd11344], %rd5817; $L__BB1_211: add.u64 %rd11346, %SPL, 32; cvt.u64.u32 %rd5818, %r146; cvt.u64.u32 %rd5819, %r147; bfi.b64 %rd541, %rd5819, %rd5818, 32, 32; ld.local.v4.u32 {%r1928, %r1929, %r1930, %r1931}, [%rd11346]; mov.b64 
%rd543, {%r1930, %r1931}; mov.b64 %rd542, {%r1928, %r1929}; mov.b32 {%rs279, %rs280}, %r1930; and.b16 %rs281, %rs279, 255; setp.eq.s16 %p386, %rs281, 2; cvt.u64.u16 %rd5820, %rs279; and.b64 %rd5821, %rd5820, 255; selp.b64 %rd5822, 2, %rd5821, %p386; and.b64 %rd5823, %rd543, 4294967040; or.b64 %rd5824, %rd5823, %rd5822; mov.b64 {%r1936, %r1937}, %rd5824; mov.b32 {%rs1018, %rs282}, %r1936; and.b16 %rs283, %rs1018, 255; setp.eq.s16 %p387, %rs283, 2; mov.u32 %r4848, 2; mov.u32 %r4846, 0; mov.u32 %r4847, %r4846; @%p387 bra $L__BB1_235; ld.global.u8 %rs284, [%rd35+64]; setp.eq.s16 %p388, %rs284, 0; shr.u64 %rd5825, %rd542, 32; cvt.u32.u64 %r1938, %rd5825; mov.b32 %f154, %r1938; @%p388 bra $L__BB1_216; mov.b64 {%r1939, %r1940}, %rd541; mov.b32 %f156, %r1940; mov.b32 %f155, %r1939; mov.b64 {%r1941, %r1942}, %rd527; mov.b64 {%r1943, %r1944}, %rd528; ld.global.u8 %rs20, [%rd35+65]; mov.b32 %f1945, %r1943; setp.gt.f32 %p390, %f155, %f1945; mov.b32 %f1946, %r1941; setp.lt.f32 %p391, %f155, %f1946; or.pred %p392, %p391, %p390; mov.pred %p2911, 0; @%p392 bra $L__BB1_215; setp.geu.f32 %p393, %f156, 0fFF7FFFFF; setp.leu.f32 %p394, %f156, 0f7F7FFFFF; and.pred %p2911, %p394, %p393; $L__BB1_215: setp.ge.f32 %p395, %f119, %f154; setp.le.f32 %p396, %f119, %f154; setp.eq.s16 %p397, %rs20, 0; selp.u32 %r1945, -1, 0, %p395; selp.u32 %r1946, -1, 0, %p396; selp.b32 %r1947, %r1946, %r1945, %p397; and.b32 %r1948, %r1947, 1; setp.eq.b32 %p398, %r1948, 1; and.pred %p399, %p398, %p2911; selp.u16 %rs1018, 1, 0, %p399; $L__BB1_216: cvt.u32.u64 %r1949, %rd542; mov.b32 %f1947, %r1949; mul.f32 %f1948, %f117, %f1947; ld.global.f32 %f1949, [%rd35+252]; mul.f32 %f1950, %f1949, %f154; sub.f32 %f1951, %f1948, %f1950; mul.f32 %f1952, %f1949, %f1947; fma.rn.f32 %f1953, %f117, %f154, %f1952; add.f32 %f1954, %f115, %f1951; mov.b32 %r1950, %f1954; add.f32 %f1955, %f116, %f1953; mov.b32 %r1951, %f1955; cvt.u64.u32 %rd5826, %r1951; cvt.u64.u32 %rd5827, %r1950; cvt.u64.u16 %rd5828, %rs1018; bfi.b64 %rd5829, 
%rd5826, %rd5827, 32, 32; and.b64 %rd5830, %rd5828, 255; mov.b64 {%r4846, %r4847}, %rd5829; mov.b64 {%r4848, %r1952}, %rd5830; bra.uni $L__BB1_235; $L__BB1_217: add.s64 %rd11557, %rd7, 8; ld.global.f32 %f157, [%rd35+256]; sub.f32 %f1956, %f2, %f157; ld.global.f32 %f158, [%rd35+260]; sub.f32 %f1957, %f3, %f158; ld.global.f32 %f159, [%rd35+252]; ld.global.f32 %f160, [%rd35+248]; mul.f32 %f1958, %f1957, %f159; fma.rn.f32 %f161, %f1956, %f160, %f1958; mul.f32 %f1959, %f1956, %f159; mul.f32 %f1960, %f1957, %f160; sub.f32 %f162, %f1960, %f1959; ld.global.u32 %rd5832, [%rd35+8]; ld.global.u32 %rd5833, [%rd35+12]; bfi.b64 %rd5834, %rd5833, %rd5832, 32, 32; cvt.u32.u64 %r1953, %rd5834; mov.b32 %f1961, %r1953; shr.u64 %rd5835, %rd5834, 32; cvt.u32.u64 %r1954, %rd5835; mov.b32 %f1962, %r1954; neg.f32 %f1963, %f1961; neg.f32 %f1964, %f1962; sub.f32 %f163, %f1963, %f161; sub.f32 %f164, %f1964, %f162; sub.f32 %f165, %f161, %f1961; sub.f32 %f166, %f162, %f1962; setp.ge.f32 %p400, %f163, 0f00000000; selp.f32 %f1965, %f163, 0f00000000, %p400; setp.ge.f32 %p401, %f164, 0f00000000; selp.f32 %f1966, %f164, 0f00000000, %p401; setp.ge.f32 %p402, %f165, 0f00000000; selp.f32 %f1967, %f165, 0f00000000, %p402; setp.ge.f32 %p403, %f166, 0f00000000; selp.f32 %f1968, %f166, 0f00000000, %p403; sub.f32 %f167, %f1965, %f1967; mov.b32 %r1955, %f167; sub.f32 %f168, %f1966, %f1968; mov.b32 %r1956, %f168; cvt.u64.u32 %rd5836, %r1956; cvt.u64.u32 %rd5837, %r1955; bfi.b64 %rd5838, %rd5836, %rd5837, 32, 32; st.local.u64 [%rd7], %rd5838; mov.u64 %rd11564, 2; mov.u64 %rd11558, %rd7; mov.u64 %rd11559, %rd7; mov.u64 %rd11560, %rd5241; mov.u64 %rd11561, %rd7; mov.u64 %rd11562, %rd7; mov.u64 %rd11563, %rd5241; $L__BB1_218: setp.eq.s64 %p404, %rd11564, 0; @%p404 bra $L__BB1_221; add.s64 %rd11564, %rd11564, -1; add.s64 %rd5839, %rd11561, 8; setp.eq.s64 %p405, %rd11561, %rd11557; selp.b64 %rd11557, %rd5839, %rd11557, %p405; add.s64 %rd5840, %rd11558, 8; selp.b64 %rd11558, %rd5840, %rd11558, %p405; add.s64 
%rd5841, %rd11559, 8; selp.b64 %rd11559, %rd5841, %rd11559, %p405; add.s64 %rd5842, %rd11560, 8; selp.b64 %rd11560, %rd5842, %rd11560, %p405; selp.b64 %rd5843, %rd5840, %rd11561, %p405; selp.b64 %rd5844, %rd5841, %rd11562, %p405; selp.b64 %rd5845, %rd5842, %rd11563, %p405; setp.eq.s64 %p406, %rd11564, 0; add.s64 %rd5846, %rd5843, 4; add.s64 %rd5847, %rd5844, 4; add.s64 %rd5848, %rd5845, 4; selp.b64 %rd11561, %rd5843, %rd5846, %p406; selp.b64 %rd11562, %rd5844, %rd5847, %p406; selp.b64 %rd11563, %rd5845, %rd5848, %p406; ld.local.f32 %f1969, [%rd5844]; setp.eq.f32 %p407, %f1969, 0f00000000; @%p407 bra $L__BB1_218; add.f32 %f1970, %f161, %f167; mov.b32 %r1957, %f1970; add.f32 %f1971, %f162, %f168; mov.b32 %r1958, %f1971; cvt.u64.u32 %rd5851, %r1958; cvt.u64.u32 %rd5852, %r1957; bfi.b64 %rd11568, %rd5851, %rd5852, 32, 32; mov.u64 %rd11567, 0; bra.uni $L__BB1_234; $L__BB1_221: setp.lt.f32 %p408, %f163, %f165; mov.f32 %f5329, 0fFF7FFFFF; @%p408 bra $L__BB1_224; bra.uni $L__BB1_222; $L__BB1_224: setp.leu.f32 %p413, %f165, 0fFF7FFFFF; mov.pred %p2912, 0; @%p413 bra $L__BB1_226; mov.f32 %f5329, %f165; bra.uni $L__BB1_226; $L__BB1_222: setp.leu.f32 %p410, %f163, 0fFF7FFFFF; mov.pred %p2912, 0; @%p410 bra $L__BB1_226; mov.pred %p2912, -1; mov.f32 %f5329, %f163; $L__BB1_226: setp.lt.f32 %p415, %f164, %f166; @%p415 bra $L__BB1_229; bra.uni $L__BB1_227; $L__BB1_229: setp.gt.f32 %p417, %f166, %f5329; @%p417 bra $L__BB1_232; bra.uni $L__BB1_230; $L__BB1_232: add.u64 %rd5859, %SPL, 32; mov.u64 %rd5860, 0; st.local.u64 [%rd5859], %rd5860; neg.f32 %f5331, %f166; add.s64 %rd11566, %rd5859, 4; bra.uni $L__BB1_233; $L__BB1_227: setp.leu.f32 %p416, %f164, %f5329; @%p416 bra $L__BB1_230; add.u64 %rd5854, %SPL, 32; mov.u64 %rd5855, 0; st.local.u64 [%rd5854], %rd5855; add.s64 %rd11566, %rd5854, 4; mov.f32 %f5329, %f164; bra.uni $L__BB1_231; $L__BB1_230: add.u64 %rd11566, %SPL, 32; mov.u64 %rd5857, 0; st.local.u64 [%rd11566], %rd5857; neg.f32 %f5331, %f5329; not.pred %p418, %p2912; @%p418 
bra $L__BB1_233; $L__BB1_231: mov.f32 %f5331, %f5329; $L__BB1_233: st.local.f32 [%rd11566], %f5331; add.u64 %rd5864, %SPL, 32; ld.local.u64 %rd5865, [%rd5864]; cvt.u32.u64 %r1959, %rd5865; mov.b32 %f1974, %r1959; shr.u64 %rd5866, %rd5865, 32; cvt.u32.u64 %r1960, %rd5866; mov.b32 %f1975, %r1960; add.f32 %f1976, %f161, %f1974; add.f32 %f1977, %f162, %f1975; mov.b32 %r1961, %f1976; mov.b32 %r1962, %f1977; cvt.u64.u32 %rd5867, %r1962; cvt.u64.u32 %rd5868, %r1961; bfi.b64 %rd11568, %rd5867, %rd5868, 32, 32; mov.u64 %rd11567, 1; $L__BB1_234: mov.u64 %rd11035, 0; cvt.u32.u64 %r1963, %rd11568; mov.b32 %f1978, %r1963; shr.u64 %rd5869, %rd11568, 32; cvt.u32.u64 %r1964, %rd5869; mov.b32 %f1979, %r1964; mul.f32 %f1980, %f160, %f1978; mul.f32 %f1981, %f159, %f1979; sub.f32 %f1982, %f1980, %f1981; mul.f32 %f1983, %f160, %f1979; fma.rn.f32 %f1984, %f159, %f1978, %f1983; add.f32 %f1985, %f157, %f1982; mov.b32 %r1965, %f1985; add.f32 %f1986, %f158, %f1984; mov.b32 %r1966, %f1986; cvt.u64.u32 %rd5870, %r1966; cvt.u64.u32 %rd5871, %r1965; bfi.b64 %rd5872, %rd5870, %rd5871, 32, 32; or.b64 %rd5873, %rd11035, %rd5872; mov.b64 {%r4846, %r4847}, %rd5873; mov.b64 {%r4848, %r1967}, %rd11567; bra.uni $L__BB1_235; $L__BB1_34: ld.local.u32 %r1692, [%rd45+24]; setp.eq.s32 %p155, %r1692, 0; @%p155 bra $L__BB1_47; setp.ne.s32 %p156, %r1692, 1; @%p156 bra $L__BB1_60; add.s64 %rd65, %rd11391, 1; or.b64 %rd5314, %rd65, %rd46; and.b64 %rd5315, %rd5314, -4294967296; setp.eq.s64 %p157, %rd5315, 0; @%p157 bra $L__BB1_38; rem.u64 %rd11395, %rd65, %rd46; bra.uni $L__BB1_39; $L__BB1_47: setp.eq.s64 %p164, %rd11391, 0; selp.b64 %rd112, %rd46, %rd11391, %p164; add.s64 %rd5356, %rd112, -1; setp.gt.u64 %p165, %rd46, %rd5356; @%p165 bra $L__BB1_49; bra.uni $L__BB1_48; $L__BB1_49: shl.b64 %rd5357, %rd112, 3; add.s64 %rd5358, %rd47, %rd5357; ld.u32 %rd5359, [%rd5358+-8]; ld.u32 %rd5360, [%rd5358+-4]; bfi.b64 %rd113, %rd5360, %rd5359, 32, 32; or.b64 %rd5361, %rd112, %rd46; and.b64 %rd5362, %rd5361, -4294967296; 
setp.eq.s64 %p166, %rd5362, 0; @%p166 bra $L__BB1_51; rem.u64 %rd11412, %rd112, %rd46; bra.uni $L__BB1_52; $L__BB1_165: ld.u32 %r1878, [%rd178+76]; cvt.u64.u32 %rd5736, %r1878; setp.le.u64 %p332, %rd169, %rd5736; mul.wide.u32 %rd5737, %r1878, 12; add.s64 %rd5738, %rd170, %rd5737; setp.eq.s64 %p333, %rd5738, 0; or.pred %p334, %p332, %p333; selp.b32 %r40, %r40, %r4802, %p334; selp.b32 %r39, %r39, %r4801, %p334; selp.b32 %r38, %r38, %r4800, %p334; selp.b32 %r42, %r42, %r4815, %p334; selp.b32 %r43, %r43, %r92, %p334; $L__BB1_71: mov.u32 %r44, %r45; setp.eq.s32 %p176, %r44, 0; @%p176 bra $L__BB1_172; cvt.u64.u32 %rd5422, %r44; add.s64 %rd5423, %rd5422, -1; cvt.u32.u64 %r45, %rd5423; st.local.u32 [%rd9+512], %r45; mul.wide.u32 %rd5424, %r44, 8; add.s64 %rd5425, %rd9, %rd5424; ld.local.u32 %rd176, [%rd5425+-4]; ld.local.u32 %rd5426, [%rd5425+-8]; shl.b64 %rd5427, %rd5426, 32; or.b64 %rd175, %rd5427, 1; mov.b64 {%r1734, %r1735}, %rd176; mov.b32 %f1689, %r1734; neg.f32 %f1690, %f1689; mov.b32 %f1691, %r43; setp.le.f32 %p177, %f1691, %f1690; @%p177 bra $L__BB1_71; mov.b64 {%r1736, %r1737}, %rd175; cvt.u64.u32 %rd177, %r1737; setp.gt.u64 %p178, %rd166, %rd177; @%p178 bra $L__BB1_75; bra.uni $L__BB1_74; $L__BB1_75: mul.lo.s64 %rd5428, %rd177, 96; add.s64 %rd178, %rd168, %rd5428; ld.u8 %rs219, [%rd178+88]; and.b16 %rs220, %rs219, 1; setp.eq.b16 %p180, %rs220, 1; mov.pred %p2909, 0; xor.pred %p181, %p180, %p2909; not.pred %p182, %p181; @%p182 bra $L__BB1_77; ld.v4.u32 {%r1738, %r1739, %r1740, %r1741}, [%rd178+64]; cvt.u64.u32 %rd5429, %r1738; setp.gt.u64 %p184, %rd169, %rd5429; mul.wide.u32 %rd5430, %r1738, 12; add.s64 %rd5431, %rd170, %rd5430; selp.b64 %rd5432, %rd5431, 0, %p184; setp.eq.s64 %p185, %rd5432, 0; add.s64 %rd5433, %rd5432, 8; selp.b64 %rd11433, 0, %rd5433, %p185; cvt.u64.u32 %rd5434, %r1739; setp.gt.u64 %p186, %rd169, %rd5434; mul.wide.u32 %rd5435, %r1739, 12; add.s64 %rd5436, %rd170, %rd5435; selp.b64 %rd5437, %rd5436, 0, %p186; setp.eq.s64 %p187, %rd5437, 0; 
add.s64 %rd5438, %rd5437, 8; selp.b64 %rd11432, 0, %rd5438, %p187; ld.u32 %r1745, [%rd178+72]; cvt.u64.u32 %rd5439, %r1745; setp.gt.u64 %p188, %rd169, %rd5439; mul.wide.u32 %rd5440, %r1745, 12; add.s64 %rd5441, %rd170, %rd5440; selp.b64 %rd5442, %rd5441, 0, %p188; setp.eq.s64 %p189, %rd5442, 0; add.s64 %rd5443, %rd5442, 8; selp.b64 %rd11431, 0, %rd5443, %p189; cvt.u64.u32 %rd5444, %r1741; setp.gt.u64 %p190, %rd169, %rd5444; mul.wide.u32 %rd5445, %r1741, 12; add.s64 %rd5446, %rd170, %rd5445; selp.b64 %rd5447, %rd5446, 0, %p190; setp.eq.s64 %p191, %rd5447, 0; add.s64 %rd5448, %rd5447, 8; selp.b64 %rd11430, 0, %rd5448, %p191; mov.pred %p2909, -1; $L__BB1_77: mov.b32 %f5234, %r43; ld.v4.f32 {%f1692, %f1693, %f1694, %f1695}, [%rd178]; sub.f32 %f1700, %f1692, %f49; sub.f32 %f1701, %f1693, %f49; sub.f32 %f1702, %f1694, %f49; sub.f32 %f1703, %f1695, %f49; ld.v4.f32 {%f1704, %f1705, %f1706, %f1707}, [%rd178+16]; sub.f32 %f1712, %f1704, %f50; sub.f32 %f1713, %f1705, %f50; sub.f32 %f1714, %f1706, %f50; sub.f32 %f1715, %f1707, %f50; ld.v4.f32 {%f1716, %f1717, %f1718, %f1719}, [%rd178+32]; sub.f32 %f1724, %f49, %f1716; sub.f32 %f1725, %f49, %f1717; sub.f32 %f1726, %f49, %f1718; sub.f32 %f1727, %f49, %f1719; ld.v4.f32 {%f1728, %f1729, %f1730, %f1731}, [%rd178+48]; sub.f32 %f1736, %f50, %f1728; sub.f32 %f1737, %f50, %f1729; sub.f32 %f1738, %f50, %f1730; sub.f32 %f1739, %f50, %f1731; setp.ge.f32 %p192, %f1700, %f1724; selp.f32 %f1740, %f1700, %f1724, %p192; setp.ge.f32 %p193, %f1701, %f1725; selp.f32 %f1741, %f1701, %f1725, %p193; setp.ge.f32 %p194, %f1702, %f1726; selp.f32 %f1742, %f1702, %f1726, %p194; setp.ge.f32 %p195, %f1703, %f1727; selp.f32 %f1743, %f1703, %f1727, %p195; setp.ge.f32 %p196, %f1712, %f1736; selp.f32 %f1744, %f1712, %f1736, %p196; setp.ge.f32 %p197, %f1713, %f1737; selp.f32 %f1745, %f1713, %f1737, %p197; setp.ge.f32 %p198, %f1714, %f1738; selp.f32 %f1746, %f1714, %f1738, %p198; setp.ge.f32 %p199, %f1715, %f1739; selp.f32 %f1747, %f1715, %f1739, %p199; 
setp.ge.f32 %p200, %f1740, 0f00000000; selp.f32 %f1748, %f1740, 0f00000000, %p200; setp.ge.f32 %p201, %f1741, 0f00000000; selp.f32 %f1749, %f1741, 0f00000000, %p201; setp.ge.f32 %p202, %f1742, 0f00000000; selp.f32 %f1750, %f1742, 0f00000000, %p202; setp.ge.f32 %p203, %f1743, 0f00000000; selp.f32 %f1751, %f1743, 0f00000000, %p203; mov.b32 %r1746, %f1748; mov.b32 %r1747, %f1749; mov.b32 %r1748, %f1750; mov.b32 %r1749, %f1751; cvt.u64.u32 %rd5449, %r1749; cvt.u64.u32 %rd5450, %r1747; cvt.u64.u32 %rd5451, %r1746; cvt.u64.u32 %rd5452, %r1748; bfi.b64 %rd5453, %rd5449, %rd5452, 32, 32; bfi.b64 %rd5454, %rd5450, %rd5451, 32, 32; setp.ge.f32 %p204, %f1744, 0f00000000; selp.f32 %f1752, %f1744, 0f00000000, %p204; setp.ge.f32 %p205, %f1745, 0f00000000; selp.f32 %f1753, %f1745, 0f00000000, %p205; setp.ge.f32 %p206, %f1746, 0f00000000; selp.f32 %f1754, %f1746, 0f00000000, %p206; setp.ge.f32 %p207, %f1747, 0f00000000; selp.f32 %f1755, %f1747, 0f00000000, %p207; mov.b32 %r1750, %f1752; mov.b32 %r1751, %f1753; mov.b32 %r1752, %f1754; mov.b32 %r1753, %f1755; cvt.u64.u32 %rd5455, %r1753; cvt.u64.u32 %rd5456, %r1751; cvt.u64.u32 %rd5457, %r1750; cvt.u64.u32 %rd5458, %r1752; bfi.b64 %rd5459, %rd5455, %rd5458, 32, 32; bfi.b64 %rd5460, %rd5456, %rd5457, 32, 32; mov.b64 {%r1754, %r1755}, %rd5454; mov.b64 {%r1756, %r1757}, %rd5453; cvt.u64.u32 %rd5461, %r1757; cvt.u64.u32 %rd5462, %r1755; cvt.u64.u32 %rd5463, %r1756; bfi.b64 %rd5464, %rd5461, %rd5463, 32, 32; mov.b64 {%r1758, %r1759}, %rd5464; bfi.b64 %rd5465, %rd5462, %rd5451, 32, 32; mov.b64 {%r1760, %r1761}, %rd5465; mov.b32 %f1756, %r1760; mov.b32 %f1757, %r1761; mov.b32 %f1758, %r1758; mov.b32 %f1759, %r1759; mov.b32 %f1760, %r1754; mov.b32 %f1761, %r1755; mov.b32 %f1762, %r1756; mov.b32 %f1763, %r1757; mov.b64 {%r1762, %r1763}, %rd5460; mov.b64 {%r1764, %r1765}, %rd5459; cvt.u64.u32 %rd5466, %r1765; cvt.u64.u32 %rd5467, %r1763; cvt.u64.u32 %rd5468, %r1764; bfi.b64 %rd5469, %rd5466, %rd5468, 32, 32; mov.b64 {%r1766, %r1767}, %rd5469; 
bfi.b64 %rd5470, %rd5467, %rd5457, 32, 32; mov.b64 {%r1768, %r1769}, %rd5470; mov.b32 %f1764, %r1768; mov.b32 %f1765, %r1769; mov.b32 %f1766, %r1766; mov.b32 %f1767, %r1767; mov.b32 %f1768, %r1762; mov.b32 %f1769, %r1763; mov.b32 %f1770, %r1764; mov.b32 %f1771, %r1765; mul.f32 %f1772, %f1768, %f1764; mul.f32 %f1773, %f1769, %f1765; mul.f32 %f1774, %f1770, %f1766; mul.f32 %f1775, %f1771, %f1767; fma.rn.f32 %f1776, %f1760, %f1756, %f1772; fma.rn.f32 %f1777, %f1761, %f1757, %f1773; fma.rn.f32 %f1778, %f1762, %f1758, %f1774; fma.rn.f32 %f1779, %f1763, %f1759, %f1775; add.f32 %f1780, %f1776, 0f00000000; add.f32 %f1781, %f1777, 0f00000000; add.f32 %f1782, %f1778, 0f00000000; add.f32 %f1783, %f1779, 0f00000000; sqrt.rn.f32 %f1784, %f1780; sqrt.rn.f32 %f1785, %f1781; sqrt.rn.f32 %f1786, %f1782; sqrt.rn.f32 %f1787, %f1783; mov.b32 %r1770, %f1784; mov.b32 %r1771, %f1785; mov.b32 %r1772, %f1786; mov.b32 %r1773, %f1787; cvt.u64.u32 %rd5471, %r1773; cvt.u64.u32 %rd5472, %r1771; cvt.u64.u32 %rd5473, %r1770; cvt.u64.u32 %rd5474, %r1772; bfi.b64 %rd11539, %rd5471, %rd5474, 32, 32; mov.b64 {%r1774, %r1775}, %rd11539; bfi.b64 %rd11538, %rd5472, %rd5473, 32, 32; mov.b64 {%r1776, %r1777}, %rd11538; mov.b32 %f1788, %r1776; mov.b32 %f1789, %r1777; mov.b32 %f1790, %r1774; mov.b32 %f1791, %r1775; setp.lt.f32 %p208, %f1788, %f5234; setp.lt.f32 %p209, %f1789, %f5234; setp.lt.f32 %p210, %f1790, %f5234; setp.lt.f32 %p211, %f1791, %f5234; selp.u32 %r1778, 1, 0, %p208; selp.u32 %r1779, -1, 0, %p209; bfi.b32 %r1780, %r1779, %r1778, 8, 1; selp.u32 %r1781, -1, 0, %p210; bfi.b32 %r1782, %r1781, %r1780, 16, 1; selp.u32 %r1783, -1, 0, %p211; bfi.b32 %r1784, %r1783, %r1782, 24, 1; cvt.u64.u32 %rd5475, %r1784; mov.b64 {%r1785, %r1786}, %rd5475; mov.b32 {%rs221, %rs222}, %r1785; and.b16 %rs223, %rs221, 1; shr.u16 %rs224, %rs221, 7; and.b16 %rs225, %rs224, 2; or.b16 %rs226, %rs225, %rs223; shl.b16 %rs227, %rs222, 2; and.b16 %rs228, %rs227, 4; or.b16 %rs229, %rs226, %rs228; shr.u16 %rs230, %rs222, 5; 
and.b16 %rs231, %rs230, 8; or.b16 %rs232, %rs229, %rs231; cvt.u64.u16 %rd189, %rs232; @%p2909 bra $L__BB1_79; bra.uni $L__BB1_78; $L__BB1_79: mov.u64 %rd192, 1; st.local.v2.u64 [%rd8], {%rd11433, %rd11432}; st.local.v2.u64 [%rd8+16], {%rd11431, %rd11430}; mov.f32 %f1793, 0f00000000; st.local.v4.f32 [%rd24], {%f1793, %f1793, %f1793, %f1793}; mov.u32 %r1797, 4; st.local.u32 [%rd7+16], %r1797; st.local.u32 [%rd7+52], %r1797; st.local.u32 [%rd7+88], %r1797; st.local.u32 [%rd7+124], %r1797; $L__BB1_80: mov.u64 %rd11334, 1; add.s64 %rd5481, %rd192, -1; cvt.u32.u64 %r1798, %rd5481; shl.b64 %rd5483, %rd11334, %r1798; and.b64 %rd5484, %rd5483, %rd189; setp.eq.s64 %p212, %rd5484, 0; @%p212 bra $L__BB1_133; shl.b64 %rd5485, %rd192, 3; add.s64 %rd5486, %rd8, %rd5485; ld.local.u64 %rd193, [%rd5486+-8]; setp.eq.s64 %p213, %rd193, 0; @%p213 bra $L__BB1_133; ld.u32 %r46, [%rd193]; cvt.u64.u32 %rd194, %r46; ld.global.u64 %rd5487, [%rd35+112]; setp.gt.u64 %p214, %rd5487, %rd194; @%p214 bra $L__BB1_84; bra.uni $L__BB1_83; $L__BB1_84: ld.global.u64 %rd5488, [%rd35+104]; mul.lo.s64 %rd5489, %rd194, 12; add.s64 %rd195, %rd5488, %rd5489; ld.u32 %rd196, [%rd195+8]; ld.u32 %rd197, [%rd195]; ld.global.u64 %rd198, [%rd35+96]; setp.gt.u64 %p215, %rd198, %rd197; @%p215 bra $L__BB1_86; bra.uni $L__BB1_85; $L__BB1_86: ld.global.u64 %rd199, [%rd35+88]; shl.b64 %rd5490, %rd197, 3; add.s64 %rd5491, %rd199, %rd5490; ld.u32 %rd5492, [%rd5491]; ld.u32 %rd5493, [%rd5491+4]; bfi.b64 %rd200, %rd5493, %rd5492, 32, 32; ld.u32 %rd201, [%rd195+4]; setp.gt.u64 %p216, %rd198, %rd201; @%p216 bra $L__BB1_88; bra.uni $L__BB1_87; $L__BB1_88: setp.gt.u64 %p217, %rd198, %rd196; @%p217 bra $L__BB1_90; bra.uni $L__BB1_89; $L__BB1_90: shl.b64 %rd5494, %rd201, 3; add.s64 %rd5495, %rd199, %rd5494; shl.b64 %rd5496, %rd196, 3; add.s64 %rd5497, %rd199, %rd5496; cvt.u32.u64 %r1799, %rd200; mov.b32 %f51, %r1799; shr.u64 %rd5498, %rd200, 32; cvt.u32.u64 %r1800, %rd5498; mov.b32 %f52, %r1800; ld.u32 %rd5499, [%rd5495]; ld.u32 
%rd5500, [%rd5495+4]; bfi.b64 %rd202, %rd5500, %rd5499, 32, 32; cvt.u32.u64 %r1801, %rd202; shr.u64 %rd5501, %rd202, 32; cvt.u32.u64 %r1802, %rd5501; mov.b32 %f53, %r1801; sub.f32 %f54, %f53, %f51; mov.b32 %f5320, %r1802; sub.f32 %f56, %f5320, %f52; ld.u32 %rd5502, [%rd5497]; ld.u32 %rd5503, [%rd5497+4]; bfi.b64 %rd203, %rd5503, %rd5502, 32, 32; cvt.u32.u64 %r1803, %rd203; shr.u64 %rd5504, %rd203, 32; cvt.u32.u64 %r1804, %rd5504; mov.b32 %f57, %r1803; sub.f32 %f58, %f57, %f51; mov.b32 %f59, %r1804; sub.f32 %f60, %f59, %f52; sub.f32 %f61, %f49, %f51; sub.f32 %f62, %f50, %f52; mul.f32 %f1794, %f56, %f62; fma.rn.f32 %f63, %f54, %f61, %f1794; mul.f32 %f1795, %f60, %f62; fma.rn.f32 %f64, %f58, %f61, %f1795; setp.le.f32 %p218, %f63, 0f00000000; setp.le.f32 %p219, %f64, 0f00000000; and.pred %p220, %p218, %p219; @%p220 bra $L__BB1_128; bra.uni $L__BB1_91; $L__BB1_128: add.u64 %rd11524, %SP, 552; add.u64 %rd11530, %SP, 0; st.local.u64 [%rd1], %rd200; mov.u64 %rd11535, 2; mov.u64 %rd11521, %rd23; mov.u64 %rd11522, %rd10; mov.u64 %rd11523, %rd10; mov.u64 %rd11525, %rd10; mov.u64 %rd11526, %rd10; mov.u64 %rd11527, %rd11524; mov.u64 %rd11528, %rd1; mov.u64 %rd11529, %rd1; mov.u64 %rd11531, %rd1; mov.u64 %rd11532, %rd1; mov.u64 %rd11533, %rd11530; mov.u64 %rd11534, %rd17; $L__BB1_129: setp.eq.s64 %p273, %rd11535, 0; mov.u64 %rd11536, 1; @%p273 bra $L__BB1_131; add.s64 %rd11535, %rd11535, -1; add.s64 %rd5649, %rd11522, 8; setp.eq.s64 %p274, %rd11525, %rd11521; selp.b64 %rd5650, %rd5649, %rd11525, %p274; add.s64 %rd5651, %rd11523, 8; selp.b64 %rd5652, %rd5651, %rd11526, %p274; add.s64 %rd5653, %rd11524, 8; selp.b64 %rd5654, %rd5653, %rd11527, %p274; mov.u64 %rd11536, 0; setp.eq.s64 %p275, %rd11535, 0; add.s64 %rd5655, %rd5650, 4; add.s64 %rd5656, %rd5652, 4; add.s64 %rd5657, %rd5654, 4; selp.b64 %rd429, %rd5650, %rd5655, %p275; selp.b64 %rd11526, %rd5652, %rd5656, %p275; selp.b64 %rd11527, %rd5654, %rd5657, %p275; selp.b64 %rd11522, %rd5649, %rd11522, %p274; selp.b64 %rd11523, 
%rd5651, %rd11523, %p274; selp.b64 %rd11524, %rd5653, %rd11524, %p274; add.s64 %rd5658, %rd11525, 8; selp.b64 %rd11521, %rd5658, %rd11521, %p274; add.s64 %rd5659, %rd11531, 8; setp.eq.s64 %p276, %rd11528, %rd11534; selp.b64 %rd5660, %rd5659, %rd11528, %p276; add.s64 %rd5661, %rd11532, 8; selp.b64 %rd5662, %rd5661, %rd11529, %p276; add.s64 %rd5663, %rd11533, 8; selp.b64 %rd5664, %rd5663, %rd11530, %p276; selp.b64 %rd11531, %rd5659, %rd11531, %p276; selp.b64 %rd11532, %rd5661, %rd11532, %p276; selp.b64 %rd11533, %rd5663, %rd11533, %p276; add.s64 %rd5665, %rd11528, 8; selp.b64 %rd11534, %rd5665, %rd11534, %p276; add.s64 %rd5666, %rd5660, 4; add.s64 %rd5667, %rd5662, 4; add.s64 %rd5668, %rd5664, 4; selp.b64 %rd11528, %rd5660, %rd5666, %p275; selp.b64 %rd11529, %rd5662, %rd5667, %p275; selp.b64 %rd11530, %rd5664, %rd5668, %p275; ld.local.f32 %f1861, [%rd5662]; ld.local.f32 %f1862, [%rd5652]; setp.eq.f32 %p277, %f1862, %f1861; mov.u64 %rd11525, %rd429; @%p277 bra $L__BB1_129; $L__BB1_131: cvt.u32.u64 %r4735, %rd200; mov.u64 %rd11012, 0; or.b64 %rd5670, %rd11012, %rd200; mov.b64 {%r1846, %r1847}, %rd5670; mov.b64 {%r1848, %r1849}, %rd11536; cvt.u32.u64 %r1851, %rd11012; or.b32 %r4797, %r1851, %r4735; mov.u32 %r4798, 0; mov.b32 %f5324, %r1847; mov.b32 {%rs1017, %rs251}, %r1848; mov.u32 %r4799, %r4798; bra.uni $L__BB1_132; $L__BB1_91: cvt.u32.u64 %r4701, %rd202; mov.b32 %f5222, %r4701; sub.f32 %f65, %f49, %f5222; sub.f32 %f66, %f50, %f5320; mul.f32 %f1796, %f56, %f66; fma.rn.f32 %f67, %f54, %f65, %f1796; mul.f32 %f1797, %f60, %f66; fma.rn.f32 %f68, %f58, %f65, %f1797; setp.ge.f32 %p221, %f67, 0f00000000; setp.le.f32 %p222, %f68, %f67; and.pred %p223, %p222, %p221; @%p223 bra $L__BB1_124; bra.uni $L__BB1_92; $L__BB1_124: add.u64 %rd11508, %SP, 552; add.u64 %rd11514, %SP, 0; st.local.u64 [%rd2], %rd202; mov.u64 %rd11519, 2; mov.u64 %rd11505, %rd23; mov.u64 %rd11506, %rd10; mov.u64 %rd11507, %rd10; mov.u64 %rd11509, %rd10; mov.u64 %rd11510, %rd10; mov.u64 %rd11511, %rd11508; 
mov.u64 %rd11512, %rd2; mov.u64 %rd11513, %rd2; mov.u64 %rd11515, %rd2; mov.u64 %rd11516, %rd2; mov.u64 %rd11517, %rd11514; mov.u64 %rd11518, %rd18; $L__BB1_125: setp.eq.s64 %p268, %rd11519, 0; mov.u64 %rd11520, 1; @%p268 bra $L__BB1_127; add.s64 %rd11519, %rd11519, -1; add.s64 %rd5622, %rd11506, 8; setp.eq.s64 %p269, %rd11509, %rd11505; selp.b64 %rd5623, %rd5622, %rd11509, %p269; add.s64 %rd5624, %rd11507, 8; selp.b64 %rd5625, %rd5624, %rd11510, %p269; add.s64 %rd5626, %rd11508, 8; selp.b64 %rd5627, %rd5626, %rd11511, %p269; mov.u64 %rd11520, 0; setp.eq.s64 %p270, %rd11519, 0; add.s64 %rd5628, %rd5623, 4; add.s64 %rd5629, %rd5625, 4; add.s64 %rd5630, %rd5627, 4; selp.b64 %rd391, %rd5623, %rd5628, %p270; selp.b64 %rd11510, %rd5625, %rd5629, %p270; selp.b64 %rd11511, %rd5627, %rd5630, %p270; selp.b64 %rd11506, %rd5622, %rd11506, %p269; selp.b64 %rd11507, %rd5624, %rd11507, %p269; selp.b64 %rd11508, %rd5626, %rd11508, %p269; add.s64 %rd5631, %rd11509, 8; selp.b64 %rd11505, %rd5631, %rd11505, %p269; add.s64 %rd5632, %rd11515, 8; setp.eq.s64 %p271, %rd11512, %rd11518; selp.b64 %rd5633, %rd5632, %rd11512, %p271; add.s64 %rd5634, %rd11516, 8; selp.b64 %rd5635, %rd5634, %rd11513, %p271; add.s64 %rd5636, %rd11517, 8; selp.b64 %rd5637, %rd5636, %rd11514, %p271; selp.b64 %rd11515, %rd5632, %rd11515, %p271; selp.b64 %rd11516, %rd5634, %rd11516, %p271; selp.b64 %rd11517, %rd5636, %rd11517, %p271; add.s64 %rd5638, %rd11512, 8; selp.b64 %rd11518, %rd5638, %rd11518, %p271; add.s64 %rd5639, %rd5633, 4; add.s64 %rd5640, %rd5635, 4; add.s64 %rd5641, %rd5637, 4; selp.b64 %rd11512, %rd5633, %rd5639, %p270; selp.b64 %rd11513, %rd5635, %rd5640, %p270; selp.b64 %rd11514, %rd5637, %rd5641, %p270; ld.local.f32 %f1859, [%rd5635]; ld.local.f32 %f1860, [%rd5625]; setp.eq.f32 %p272, %f1860, %f1859; mov.u64 %rd11509, %rd391; @%p272 bra $L__BB1_125; $L__BB1_127: cvt.u32.u64 %r4734, %rd202; mov.u64 %rd11011, 0; or.b64 %rd5643, %rd11011, %rd202; mov.b64 {%r1838, %r1839}, %rd5643; mov.b64 {%r1840, 
%r1841}, %rd11520; cvt.u32.u64 %r1843, %rd11011; or.b32 %r4797, %r1843, %r4734; mov.u32 %r4798, 0; mov.b32 %f5324, %r1839; mov.u32 %r4799, 1; mov.b32 {%rs1017, %rs247}, %r1840; bra.uni $L__BB1_132; $L__BB1_92: shr.u64 %rd11313, %rd203, 32; cvt.u32.u64 %r4703, %rd11313; mov.b32 %f5224, %r4703; cvt.u32.u64 %r4702, %rd203; mov.b32 %f5223, %r4702; sub.f32 %f69, %f49, %f5223; sub.f32 %f70, %f50, %f5224; mul.f32 %f1798, %f56, %f70; fma.rn.f32 %f71, %f54, %f69, %f1798; mul.f32 %f1799, %f60, %f70; fma.rn.f32 %f72, %f58, %f69, %f1799; setp.ge.f32 %p224, %f72, 0f00000000; setp.le.f32 %p225, %f71, %f72; and.pred %p226, %p225, %p224; @%p226 bra $L__BB1_120; bra.uni $L__BB1_93; $L__BB1_120: add.u64 %rd11492, %SP, 552; add.u64 %rd11498, %SP, 0; st.local.u64 [%rd3], %rd203; mov.u64 %rd11503, 2; mov.u64 %rd11489, %rd23; mov.u64 %rd11490, %rd10; mov.u64 %rd11491, %rd10; mov.u64 %rd11493, %rd10; mov.u64 %rd11494, %rd10; mov.u64 %rd11495, %rd11492; mov.u64 %rd11496, %rd3; mov.u64 %rd11497, %rd3; mov.u64 %rd11499, %rd3; mov.u64 %rd11500, %rd3; mov.u64 %rd11501, %rd11498; mov.u64 %rd11502, %rd19; $L__BB1_121: setp.eq.s64 %p263, %rd11503, 0; mov.u64 %rd11504, 1; @%p263 bra $L__BB1_123; add.s64 %rd11503, %rd11503, -1; add.s64 %rd5595, %rd11490, 8; setp.eq.s64 %p264, %rd11493, %rd11489; selp.b64 %rd5596, %rd5595, %rd11493, %p264; add.s64 %rd5597, %rd11491, 8; selp.b64 %rd5598, %rd5597, %rd11494, %p264; add.s64 %rd5599, %rd11492, 8; selp.b64 %rd5600, %rd5599, %rd11495, %p264; mov.u64 %rd11504, 0; setp.eq.s64 %p265, %rd11503, 0; add.s64 %rd5601, %rd5596, 4; add.s64 %rd5602, %rd5598, 4; add.s64 %rd5603, %rd5600, 4; selp.b64 %rd353, %rd5596, %rd5601, %p265; selp.b64 %rd11494, %rd5598, %rd5602, %p265; selp.b64 %rd11495, %rd5600, %rd5603, %p265; selp.b64 %rd11490, %rd5595, %rd11490, %p264; selp.b64 %rd11491, %rd5597, %rd11491, %p264; selp.b64 %rd11492, %rd5599, %rd11492, %p264; add.s64 %rd5604, %rd11493, 8; selp.b64 %rd11489, %rd5604, %rd11489, %p264; add.s64 %rd5605, %rd11499, 8; setp.eq.s64 
%p266, %rd11496, %rd11502; selp.b64 %rd5606, %rd5605, %rd11496, %p266; add.s64 %rd5607, %rd11500, 8; selp.b64 %rd5608, %rd5607, %rd11497, %p266; add.s64 %rd5609, %rd11501, 8; selp.b64 %rd5610, %rd5609, %rd11498, %p266; selp.b64 %rd11499, %rd5605, %rd11499, %p266; selp.b64 %rd11500, %rd5607, %rd11500, %p266; selp.b64 %rd11501, %rd5609, %rd11501, %p266; add.s64 %rd5611, %rd11496, 8; selp.b64 %rd11502, %rd5611, %rd11502, %p266; add.s64 %rd5612, %rd5606, 4; add.s64 %rd5613, %rd5608, 4; add.s64 %rd5614, %rd5610, 4; selp.b64 %rd11496, %rd5606, %rd5612, %p265; selp.b64 %rd11497, %rd5608, %rd5613, %p265; selp.b64 %rd11498, %rd5610, %rd5614, %p265; ld.local.f32 %f1857, [%rd5608]; ld.local.f32 %f1858, [%rd5598]; setp.eq.f32 %p267, %f1858, %f1857; mov.u64 %rd11493, %rd353; @%p267 bra $L__BB1_121; $L__BB1_123: cvt.u32.u64 %r4733, %rd203; mov.u64 %rd11010, 0; or.b64 %rd5616, %rd11010, %rd203; mov.b64 {%r1830, %r1831}, %rd5616; mov.b64 {%r1832, %r1833}, %rd11504; cvt.u32.u64 %r1835, %rd11010; or.b32 %r4797, %r1835, %r4733; mov.u32 %r4798, 0; mov.b32 %f5324, %r1831; mov.b32 {%rs1017, %rs243}, %r1832; mov.u32 %r4799, 2; bra.uni $L__BB1_132; $L__BB1_93: cvt.u32.u64 %r4708, %rd200; mov.b32 %f5231, %r4708; sub.f32 %f5230, %f49, %f5231; shr.u64 %rd11315, %rd200, 32; cvt.u32.u64 %r4707, %rd11315; mov.b32 %f5229, %r4707; sub.f32 %f5228, %f50, %f5229; shr.u64 %rd11314, %rd203, 32; cvt.u32.u64 %r4706, %rd11314; mov.b32 %f5227, %r4706; cvt.u32.u64 %r4705, %rd203; mov.b32 %f5226, %r4705; cvt.u32.u64 %r4704, %rd202; mov.b32 %f5225, %r4704; sub.f32 %f73, %f5226, %f5225; sub.f32 %f74, %f5227, %f5320; mul.f32 %f1800, %f56, %f58; mul.f32 %f1801, %f54, %f60; sub.f32 %f75, %f1801, %f1800; mul.f32 %f1802, %f56, %f5230; mul.f32 %f1803, %f54, %f5228; sub.f32 %f1804, %f1803, %f1802; mul.f32 %f1805, %f75, %f1804; setp.lt.f32 %p227, %f1805, 0f00000000; setp.ge.f32 %p228, %f63, 0f00000000; and.pred %p229, %p228, %p227; setp.le.f32 %p230, %f67, 0f00000000; and.pred %p231, %p230, %p229; mov.u16 %rs1016, 0; 
@%p231 bra $L__BB1_96; cvt.u32.u64 %r4737, %rd203; mov.b32 %f5254, %r4737; sub.f32 %f5253, %f49, %f5254; shr.u64 %rd11336, %rd203, 32; cvt.u32.u64 %r4736, %rd11336; mov.b32 %f5252, %r4736; sub.f32 %f5251, %f50, %f5252; mul.f32 %f1806, %f58, %f5251; mul.f32 %f1807, %f5253, %f60; sub.f32 %f1808, %f1806, %f1807; mul.f32 %f1809, %f75, %f1808; setp.gt.f32 %p232, %f1809, 0f80000000; setp.ge.f32 %p233, %f64, 0f00000000; and.pred %p234, %p233, %p232; setp.le.f32 %p235, %f72, 0f00000000; and.pred %p236, %p235, %p234; mov.u16 %rs1016, 1; @%p236 bra $L__BB1_96; mul.f32 %f1810, %f73, %f66; mul.f32 %f1811, %f65, %f74; sub.f32 %f1812, %f1810, %f1811; mul.f32 %f1813, %f75, %f1812; setp.lt.f32 %p237, %f1813, 0f00000000; sub.f32 %f1814, %f68, %f67; setp.ge.f32 %p238, %f1814, 0f00000000; and.pred %p239, %p238, %p237; sub.f32 %f1815, %f71, %f72; setp.ge.f32 %p240, %f1815, 0f00000000; and.pred %p241, %p240, %p239; selp.b16 %rs1016, 2, 3, %p241; $L__BB1_96: mul.f32 %f1816, %f56, %f56; fma.rn.f32 %f1817, %f54, %f54, %f1816; add.f32 %f76, %f1817, 0f00000000; mul.f32 %f1818, %f60, %f60; fma.rn.f32 %f1819, %f58, %f58, %f1818; add.f32 %f77, %f1819, 0f00000000; mul.f32 %f1820, %f74, %f74; fma.rn.f32 %f1821, %f73, %f73, %f1820; add.f32 %f78, %f1821, 0f00000000; setp.eq.s16 %p242, %rs1016, 1; @%p242 bra $L__BB1_111; setp.eq.s16 %p243, %rs1016, 2; @%p243 bra $L__BB1_107; setp.ne.s16 %p244, %rs1016, 3; @%p244 bra $L__BB1_115; cvt.u32.u64 %r4726, %rd200; mov.b32 %f5244, %r4726; sub.f32 %f5243, %f49, %f5244; shr.u64 %rd11335, %rd200, 32; cvt.u32.u64 %r4725, %rd11335; mov.b32 %f5242, %r4725; sub.f32 %f5241, %f50, %f5242; sub.f32 %f1822, %f63, %f67; div.rn.f32 %f79, %f63, %f1822; sub.f32 %f1823, %f64, %f72; div.rn.f32 %f80, %f64, %f1823; sub.f32 %f1824, %f68, %f67; add.f32 %f1825, %f71, %f1824; sub.f32 %f1826, %f1825, %f72; div.rn.f32 %f5322, %f1824, %f1826; mul.f32 %f1827, %f5241, %f5241; fma.rn.f32 %f1828, %f5243, %f5243, %f1827; add.f32 %f1829, %f1828, 0f00000000; mul.f32 %f1830, %f76, %f79; 
mul.f32 %f1831, %f79, %f1830; sub.f32 %f82, %f1829, %f1831; mul.f32 %f1832, %f77, %f5322; mul.f32 %f1833, %f5322, %f1832; sub.f32 %f83, %f1829, %f1833; mul.f32 %f1834, %f66, %f66; fma.rn.f32 %f1835, %f65, %f65, %f1834; add.f32 %f1836, %f1835, 0f00000000; mul.f32 %f1837, %f78, %f80; mul.f32 %f1838, %f80, %f1837; sub.f32 %f84, %f1836, %f1838; setp.lt.f32 %p245, %f82, %f83; @%p245 bra $L__BB1_103; bra.uni $L__BB1_100; $L__BB1_103: setp.lt.f32 %p247, %f82, %f84; @%p247 bra $L__BB1_105; bra.uni $L__BB1_104; $L__BB1_105: cvt.u32.u64 %r4730, %rd200; mov.b32 %f5248, %r4730; mul.f32 %f5321, %f56, %f79; fma.rn.f32 %f5319, %f54, %f79, %f5248; mov.u32 %r4799, 0; mov.f32 %f5320, %f52; mov.f32 %f5322, %f79; bra.uni $L__BB1_106; $L__BB1_111: cvt.u32.u64 %r4732, %rd200; mov.b32 %f5250, %r4732; add.u64 %rd11458, %SP, 552; add.u64 %rd11464, %SP, 0; div.rn.f32 %f5323, %f64, %f77; fma.rn.f32 %f1847, %f58, %f5323, %f5250; mov.b32 %r1815, %f1847; fma.rn.f32 %f1848, %f60, %f5323, %f52; mov.b32 %r1816, %f1848; cvt.u64.u32 %rd5535, %r1816; cvt.u64.u32 %rd5536, %r1815; bfi.b64 %rd252, %rd5535, %rd5536, 32, 32; st.local.u64 [%rd5], %rd252; mov.u64 %rd11469, 2; mov.u64 %rd11455, %rd23; mov.u64 %rd11456, %rd10; mov.u64 %rd11457, %rd10; mov.u64 %rd11459, %rd10; mov.u64 %rd11460, %rd10; mov.u64 %rd11461, %rd11458; mov.u64 %rd11462, %rd5; mov.u64 %rd11463, %rd5; mov.u64 %rd11465, %rd5; mov.u64 %rd11466, %rd5; mov.u64 %rd11467, %rd11464; mov.u64 %rd11468, %rd21; $L__BB1_112: setp.eq.s64 %p253, %rd11469, 0; mov.u64 %rd11488, 1; @%p253 bra $L__BB1_114; add.s64 %rd11469, %rd11469, -1; add.s64 %rd5541, %rd11456, 8; setp.eq.s64 %p254, %rd11459, %rd11455; selp.b64 %rd5542, %rd5541, %rd11459, %p254; add.s64 %rd5543, %rd11457, 8; selp.b64 %rd5544, %rd5543, %rd11460, %p254; add.s64 %rd5545, %rd11458, 8; selp.b64 %rd5546, %rd5545, %rd11461, %p254; mov.u64 %rd11488, 0; setp.eq.s64 %p255, %rd11469, 0; add.s64 %rd5547, %rd5542, 4; add.s64 %rd5548, %rd5544, 4; add.s64 %rd5549, %rd5546, 4; selp.b64 %rd269, 
%rd5542, %rd5547, %p255; selp.b64 %rd11460, %rd5544, %rd5548, %p255; selp.b64 %rd11461, %rd5546, %rd5549, %p255; selp.b64 %rd11456, %rd5541, %rd11456, %p254; selp.b64 %rd11457, %rd5543, %rd11457, %p254; selp.b64 %rd11458, %rd5545, %rd11458, %p254; add.s64 %rd5550, %rd11459, 8; selp.b64 %rd11455, %rd5550, %rd11455, %p254; add.s64 %rd5551, %rd11465, 8; setp.eq.s64 %p256, %rd11462, %rd11468; selp.b64 %rd5552, %rd5551, %rd11462, %p256; add.s64 %rd5553, %rd11466, 8; selp.b64 %rd5554, %rd5553, %rd11463, %p256; add.s64 %rd5555, %rd11467, 8; selp.b64 %rd5556, %rd5555, %rd11464, %p256; selp.b64 %rd11465, %rd5551, %rd11465, %p256; selp.b64 %rd11466, %rd5553, %rd11466, %p256; selp.b64 %rd11467, %rd5555, %rd11467, %p256; add.s64 %rd5557, %rd11462, 8; selp.b64 %rd11468, %rd5557, %rd11468, %p256; add.s64 %rd5558, %rd5552, 4; add.s64 %rd5559, %rd5554, 4; add.s64 %rd5560, %rd5556, 4; selp.b64 %rd11462, %rd5552, %rd5558, %p255; selp.b64 %rd11463, %rd5554, %rd5559, %p255; selp.b64 %rd11464, %rd5556, %rd5560, %p255; ld.local.f32 %f1849, [%rd5554]; ld.local.f32 %f1850, [%rd5544]; setp.eq.f32 %p257, %f1850, %f1849; mov.u64 %rd11459, %rd269; @%p257 bra $L__BB1_112; $L__BB1_114: mov.u64 %rd11008, 0; or.b64 %rd11487, %rd11008, %rd252; mov.u32 %r4799, 2; bra.uni $L__BB1_119; $L__BB1_107: cvt.u32.u64 %r4731, %rd202; mov.b32 %f5249, %r4731; add.u64 %rd11442, %SP, 552; add.u64 %rd11448, %SP, 0; mul.f32 %f1841, %f74, %f66; fma.rn.f32 %f1842, %f73, %f65, %f1841; div.rn.f32 %f5323, %f1842, %f78; fma.rn.f32 %f1843, %f73, %f5323, %f5249; mov.b32 %r1812, %f1843; fma.rn.f32 %f1844, %f74, %f5323, %f5320; mov.b32 %r1813, %f1844; cvt.u64.u32 %rd5508, %r1813; cvt.u64.u32 %rd5509, %r1812; bfi.b64 %rd211, %rd5508, %rd5509, 32, 32; st.local.u64 [%rd6], %rd211; mov.u64 %rd11453, 2; mov.u64 %rd11439, %rd23; mov.u64 %rd11440, %rd10; mov.u64 %rd11441, %rd10; mov.u64 %rd11443, %rd10; mov.u64 %rd11444, %rd10; mov.u64 %rd11445, %rd11442; mov.u64 %rd11446, %rd6; mov.u64 %rd11447, %rd6; mov.u64 %rd11449, %rd6; 
mov.u64 %rd11450, %rd6; mov.u64 %rd11451, %rd11448; mov.u64 %rd11452, %rd22; $L__BB1_108: setp.eq.s64 %p248, %rd11453, 0; mov.u64 %rd11488, 1; @%p248 bra $L__BB1_110; add.s64 %rd11453, %rd11453, -1; add.s64 %rd5514, %rd11440, 8; setp.eq.s64 %p249, %rd11443, %rd11439; selp.b64 %rd5515, %rd5514, %rd11443, %p249; add.s64 %rd5516, %rd11441, 8; selp.b64 %rd5517, %rd5516, %rd11444, %p249; add.s64 %rd5518, %rd11442, 8; selp.b64 %rd5519, %rd5518, %rd11445, %p249; mov.u64 %rd11488, 0; setp.eq.s64 %p250, %rd11453, 0; add.s64 %rd5520, %rd5515, 4; add.s64 %rd5521, %rd5517, 4; add.s64 %rd5522, %rd5519, 4; selp.b64 %rd228, %rd5515, %rd5520, %p250; selp.b64 %rd11444, %rd5517, %rd5521, %p250; selp.b64 %rd11445, %rd5519, %rd5522, %p250; selp.b64 %rd11440, %rd5514, %rd11440, %p249; selp.b64 %rd11441, %rd5516, %rd11441, %p249; selp.b64 %rd11442, %rd5518, %rd11442, %p249; add.s64 %rd5523, %rd11443, 8; selp.b64 %rd11439, %rd5523, %rd11439, %p249; add.s64 %rd5524, %rd11449, 8; setp.eq.s64 %p251, %rd11446, %rd11452; selp.b64 %rd5525, %rd5524, %rd11446, %p251; add.s64 %rd5526, %rd11450, 8; selp.b64 %rd5527, %rd5526, %rd11447, %p251; add.s64 %rd5528, %rd11451, 8; selp.b64 %rd5529, %rd5528, %rd11448, %p251; selp.b64 %rd11449, %rd5524, %rd11449, %p251; selp.b64 %rd11450, %rd5526, %rd11450, %p251; selp.b64 %rd11451, %rd5528, %rd11451, %p251; add.s64 %rd5530, %rd11446, 8; selp.b64 %rd11452, %rd5530, %rd11452, %p251; add.s64 %rd5531, %rd5525, 4; add.s64 %rd5532, %rd5527, 4; add.s64 %rd5533, %rd5529, 4; selp.b64 %rd11446, %rd5525, %rd5531, %p250; selp.b64 %rd11447, %rd5527, %rd5532, %p250; selp.b64 %rd11448, %rd5529, %rd5533, %p250; ld.local.f32 %f1845, [%rd5527]; ld.local.f32 %f1846, [%rd5517]; setp.eq.f32 %p252, %f1846, %f1845; mov.u64 %rd11443, %rd228; @%p252 bra $L__BB1_108; $L__BB1_110: mov.u64 %rd11007, 0; or.b64 %rd11487, %rd11007, %rd211; mov.u32 %r4799, 1; bra.uni $L__BB1_119; $L__BB1_115: cvt.u32.u64 %r4709, %rd200; mov.b32 %f5232, %r4709; div.rn.f32 %f5323, %f63, %f76; fma.rn.f32 
%f1851, %f54, %f5323, %f5232; mov.b32 %r1818, %f1851; fma.rn.f32 %f1852, %f56, %f5323, %f52; mov.b32 %r1819, %f1852; cvt.u64.u32 %rd5562, %r1819; cvt.u64.u32 %rd5563, %r1818; bfi.b64 %rd293, %rd5562, %rd5563, 32, 32; st.local.u64 [%rd4], %rd293; mov.u64 %rd11485, 2; mov.u64 %rd11471, %rd23; mov.u64 %rd11472, %rd10; mov.u64 %rd11473, %rd10; mov.u64 %rd11474, %rd5244; mov.u64 %rd11475, %rd10; mov.u64 %rd11476, %rd10; mov.u64 %rd11477, %rd5244; mov.u64 %rd11478, %rd4; mov.u64 %rd11479, %rd4; mov.u64 %rd11480, %rd5235; mov.u64 %rd11481, %rd4; mov.u64 %rd11482, %rd4; mov.u64 %rd11483, %rd5235; mov.u64 %rd11484, %rd20; $L__BB1_116: setp.eq.s64 %p258, %rd11485, 0; mov.u64 %rd11488, 1; @%p258 bra $L__BB1_118; add.s64 %rd11485, %rd11485, -1; add.s64 %rd5568, %rd11472, 8; setp.eq.s64 %p259, %rd11475, %rd11471; selp.b64 %rd5569, %rd5568, %rd11475, %p259; add.s64 %rd5570, %rd11473, 8; selp.b64 %rd5571, %rd5570, %rd11476, %p259; add.s64 %rd5572, %rd11474, 8; selp.b64 %rd5573, %rd5572, %rd11477, %p259; mov.u64 %rd11488, 0; setp.eq.s64 %p260, %rd11485, 0; add.s64 %rd5574, %rd5569, 4; add.s64 %rd5575, %rd5571, 4; add.s64 %rd5576, %rd5573, 4; selp.b64 %rd310, %rd5569, %rd5574, %p260; selp.b64 %rd11476, %rd5571, %rd5575, %p260; selp.b64 %rd11477, %rd5573, %rd5576, %p260; selp.b64 %rd11472, %rd5568, %rd11472, %p259; selp.b64 %rd11473, %rd5570, %rd11473, %p259; selp.b64 %rd11474, %rd5572, %rd11474, %p259; add.s64 %rd5577, %rd11475, 8; selp.b64 %rd11471, %rd5577, %rd11471, %p259; add.s64 %rd5578, %rd11481, 8; setp.eq.s64 %p261, %rd11478, %rd11484; selp.b64 %rd5579, %rd5578, %rd11478, %p261; add.s64 %rd5580, %rd11482, 8; selp.b64 %rd5581, %rd5580, %rd11479, %p261; add.s64 %rd5582, %rd11483, 8; selp.b64 %rd5583, %rd5582, %rd11480, %p261; selp.b64 %rd11481, %rd5578, %rd11481, %p261; selp.b64 %rd11482, %rd5580, %rd11482, %p261; selp.b64 %rd11483, %rd5582, %rd11483, %p261; add.s64 %rd5584, %rd11478, 8; selp.b64 %rd11484, %rd5584, %rd11484, %p261; add.s64 %rd5585, %rd5579, 4; add.s64 
%rd5586, %rd5581, 4; add.s64 %rd5587, %rd5583, 4; selp.b64 %rd11478, %rd5579, %rd5585, %p260; selp.b64 %rd11479, %rd5581, %rd5586, %p260; selp.b64 %rd11480, %rd5583, %rd5587, %p260; ld.local.f32 %f1853, [%rd5581]; ld.local.f32 %f1854, [%rd5571]; setp.eq.f32 %p262, %f1854, %f1853; mov.u64 %rd11475, %rd310; @%p262 bra $L__BB1_116; $L__BB1_118: mov.u64 %rd11009, 0; or.b64 %rd11487, %rd11009, %rd293; mov.u32 %r4799, 0; $L__BB1_119: mov.f32 %f1855, 0f3F800000; sub.f32 %f1856, %f1855, %f5323; mov.b32 %r1822, %f1856; mov.b32 %r1823, %f5323; cvt.u64.u32 %rd5588, %r1823; cvt.u64.u32 %rd5589, %r1822; bfi.b64 %rd11537, %rd5588, %rd5589, 32, 32; mov.b64 {%r1824, %r1825}, %rd11488; mov.b64 {%r1826, %r1827}, %rd11487; cvt.u32.u64 %r4797, %rd11487; mov.b32 %f5324, %r1827; mov.u32 %r4798, 1; mov.b32 {%rs1017, %rs239}, %r1824; bra.uni $L__BB1_132; $L__BB1_100: setp.lt.f32 %p246, %f83, %f84; @%p246 bra $L__BB1_102; bra.uni $L__BB1_101; $L__BB1_102: cvt.u32.u64 %r4728, %rd200; mov.b32 %f5246, %r4728; mul.f32 %f5321, %f60, %f80; fma.rn.f32 %f5319, %f58, %f80, %f5246; mov.u32 %r4799, 2; mov.f32 %f5320, %f52; mov.f32 %f5322, %f80; bra.uni $L__BB1_106; $L__BB1_104: cvt.u32.u64 %r4729, %rd202; mov.b32 %f5247, %r4729; mul.f32 %f5321, %f74, %f5322; fma.rn.f32 %f5319, %f73, %f5322, %f5247; mov.u32 %r4799, 1; bra.uni $L__BB1_106; $L__BB1_101: cvt.u32.u64 %r4727, %rd202; mov.b32 %f5245, %r4727; mul.f32 %f5321, %f74, %f5322; fma.rn.f32 %f5319, %f73, %f5322, %f5245; mov.u32 %r4799, 1; $L__BB1_106: add.f32 %f5324, %f5320, %f5321; mov.f32 %f1839, 0f3F800000; sub.f32 %f1840, %f1839, %f5322; mov.b32 %r1810, %f1840; mov.b32 %r1811, %f5322; cvt.u64.u32 %rd5505, %r1811; cvt.u64.u32 %rd5506, %r1810; bfi.b64 %rd11537, %rd5505, %rd5506, 32, 32; mov.b32 %r4797, %f5319; mov.u32 %r4798, 1; mov.u16 %rs1017, 1; $L__BB1_132: mov.b32 %f1863, %r4797; sub.f32 %f1864, %f1863, %f49; sub.f32 %f1865, %f5324, %f50; mul.f32 %f1866, %f1865, %f1865; fma.rn.f32 %f1867, %f1864, %f1864, %f1866; add.f32 %f1868, %f1867, 
0f00000000; sqrt.rn.f32 %f1869, %f1868; shl.b64 %rd5671, %rd192, 2; add.s64 %rd5672, %rd24, %rd5671; st.local.f32 [%rd5672+-4], %f1869; mul.lo.s64 %rd5673, %rd192, 36; add.s64 %rd5674, %rd7, %rd5673; st.local.u32 [%rd5674+-36], %r4797; st.local.f32 [%rd5674+-32], %f5324; mov.u16 %rs252, 0; st.local.v4.u8 [%rd5674+-28], {%rs1017, %rs252, %rs252, %rs252}; st.local.u32 [%rd5674+-24], %r46; st.local.u32 [%rd5674+-20], %r4798; st.local.u32 [%rd5674+-16], %r4799; shr.u64 %rd5675, %rd11537, 32; st.local.u32 [%rd5674+-8], %rd5675; st.local.u32 [%rd5674+-12], %rd11537; $L__BB1_133: setp.lt.u64 %p278, %rd192, 4; add.s64 %rd192, %rd192, 1; @%p278 bra $L__BB1_80; ld.local.v2.u64 {%rd11538, %rd11539}, [%rd24]; ld.local.v4.u32 {%r4809, %r4810, %r4811, %r1855}, [%rd7]; ld.local.u32 %r4812, [%rd7+16]; ld.local.u32 %rd5680, [%rd7+36]; ld.local.u32 %rd5681, [%rd7+40]; bfi.b64 %rd5682, %rd5681, %rd5680, 32, 32; mov.b64 {%r4806, %r4807}, %rd5682; ld.local.u32 %r4808, [%rd7+44]; ld.local.u32 %r4813, [%rd7+52]; ld.local.u32 %r4805, [%rd7+80]; ld.local.u64 %rd5683, [%rd7+72]; mov.b64 {%r4803, %r4804}, %rd5683; ld.local.u32 %r4814, [%rd7+88]; ld.local.u32 %rd5684, [%rd7+108]; ld.local.u32 %rd5685, [%rd7+112]; bfi.b64 %rd5686, %rd5685, %rd5684, 32, 32; mov.b64 {%r4800, %r4801}, %rd5686; ld.local.u32 %r4802, [%rd7+116]; ld.local.u32 %r4815, [%rd7+124]; bra.uni $L__BB1_135; $L__BB1_78: mov.u32 %r4812, 4; mov.u32 %r4813, %r4812; mov.u32 %r4814, %r4812; mov.u32 %r4815, %r4812; $L__BB1_135: and.b64 %rd5687, %rd189, 1; setp.eq.b64 %p279, %rd5687, 1; mov.pred %p280, 0; xor.pred %p281, %p279, %p280; not.pred %p282, %p281; mov.b64 {%r89, %r90}, %rd11538; mov.b64 {%r91, %r92}, %rd11539; @%p282 bra $L__BB1_144; ld.u8 %rs253, [%rd178+88]; and.b16 %rs254, %rs253, 1; setp.eq.b16 %p283, %rs254, 1; xor.pred %p285, %p283, %p280; not.pred %p286, %p285; @%p286 bra $L__BB1_139; bra.uni $L__BB1_137; $L__BB1_139: ld.u32 %r98, [%rd178+64]; cvt.u64.u32 %rd5691, %r98; setp.le.u64 %p293, %rd166, %rd5691; @%p293 bra 
$L__BB1_144; mov.b32 %f5268, %r89; neg.f32 %f111, %f5268; setp.lt.u32 %p294, %r45, 64; @%p294 bra $L__BB1_142; bra.uni $L__BB1_141; $L__BB1_142: add.s32 %r1858, %r44, -1; mul.wide.u32 %rd5701, %r1858, 8; add.s64 %rd5702, %rd9, %rd5701; mov.u64 %rd11540, 0; st.local.u32 [%rd5702], %r98; st.local.f32 [%rd5702+4], %f111; add.s32 %r45, %r45, 1; st.local.u32 [%rd9+512], %r45; mov.u64 %rd11541, %rd11540; bra.uni $L__BB1_143; $L__BB1_137: mov.b32 %f5267, %r89; mov.b32 %f5233, %r43; setp.leu.f32 %p287, %f5233, %f5267; setp.eq.s32 %p288, %r4812, 4; or.pred %p289, %p288, %p287; @%p289 bra $L__BB1_144; ld.u32 %r1856, [%rd178+64]; cvt.u64.u32 %rd5688, %r1856; setp.le.u64 %p290, %rd169, %rd5688; mul.wide.u32 %rd5689, %r1856, 12; add.s64 %rd5690, %rd170, %rd5689; setp.eq.s64 %p291, %rd5690, 0; or.pred %p292, %p290, %p291; selp.b32 %r40, %r40, %r4811, %p292; selp.b32 %r39, %r39, %r4810, %p292; selp.b32 %r38, %r38, %r4809, %p292; selp.b32 %r42, %r42, %r4812, %p292; selp.b32 %r43, %r43, %r89, %p292; bra.uni $L__BB1_144; $L__BB1_141: cvt.u64.u32 %rd11343, %r98; mov.u64 %rd11541, 1; shl.b64 %rd11540, %rd11343, 32; $L__BB1_143: mov.u64 %rd11013, 0; cvt.u32.u64 %r1859, %rd11013; cvt.u32.u64 %r1860, %rd11540; or.b32 %r1861, %r1860, %r1859; cvt.u32.u64 %r1862, %rd11541; or.b32 %r1863, %r1861, %r1862; setp.ne.s32 %p295, %r1863, 0; @%p295 bra $L__BB1_171; $L__BB1_144: and.b64 %rd5703, %rd189, 2; setp.eq.s64 %p296, %rd5703, 0; @%p296 bra $L__BB1_153; ld.u8 %rs255, [%rd178+88]; and.b16 %rs256, %rs255, 1; setp.eq.b16 %p297, %rs256, 1; mov.pred %p298, 0; xor.pred %p299, %p297, %p298; not.pred %p300, %p299; @%p300 bra $L__BB1_148; bra.uni $L__BB1_146; $L__BB1_148: ld.u32 %r112, [%rd178+68]; cvt.u64.u32 %rd5707, %r112; setp.le.u64 %p307, %rd166, %rd5707; @%p307 bra $L__BB1_153; mov.b32 %f5270, %r90; neg.f32 %f112, %f5270; setp.lt.u32 %p308, %r45, 64; @%p308 bra $L__BB1_151; bra.uni $L__BB1_150; $L__BB1_151: mul.wide.u32 %rd5717, %r45, 8; add.s64 %rd5718, %rd9, %rd5717; mov.u64 %rd11542, 0; 
st.local.u32 [%rd5718], %r112; st.local.f32 [%rd5718+4], %f112; add.s32 %r45, %r45, 1; st.local.u32 [%rd9+512], %r45; mov.u64 %rd11543, %rd11542; bra.uni $L__BB1_152; $L__BB1_146: mov.b32 %f5269, %r90; mov.b32 %f1871, %r43; setp.leu.f32 %p301, %f1871, %f5269; setp.eq.s32 %p302, %r4813, 4; or.pred %p303, %p302, %p301; @%p303 bra $L__BB1_153; ld.u32 %r1864, [%rd178+68]; cvt.u64.u32 %rd5704, %r1864; setp.le.u64 %p304, %rd169, %rd5704; mul.wide.u32 %rd5705, %r1864, 12; add.s64 %rd5706, %rd170, %rd5705; setp.eq.s64 %p305, %rd5706, 0; or.pred %p306, %p304, %p305; selp.b32 %r40, %r40, %r4808, %p306; selp.b32 %r39, %r39, %r4807, %p306; selp.b32 %r38, %r38, %r4806, %p306; selp.b32 %r42, %r42, %r4813, %p306; selp.b32 %r43, %r43, %r90, %p306; bra.uni $L__BB1_153; $L__BB1_150: mov.u64 %rd11543, 1; shl.b64 %rd11542, %rd5707, 32; $L__BB1_152: mov.u64 %rd11016, 0; cvt.u32.u64 %r1866, %rd11016; cvt.u32.u64 %r1867, %rd11542; or.b32 %r1868, %r1867, %r1866; cvt.u32.u64 %r1869, %rd11543; or.b32 %r1870, %r1868, %r1869; setp.ne.s32 %p309, %r1870, 0; @%p309 bra $L__BB1_171; $L__BB1_153: and.b64 %rd5719, %rd189, 4; setp.eq.s64 %p310, %rd5719, 0; @%p310 bra $L__BB1_162; ld.u8 %rs257, [%rd178+88]; and.b16 %rs258, %rs257, 1; setp.eq.b16 %p311, %rs258, 1; mov.pred %p312, 0; xor.pred %p313, %p311, %p312; not.pred %p314, %p313; @%p314 bra $L__BB1_157; bra.uni $L__BB1_155; $L__BB1_157: ld.u32 %r126, [%rd178+72]; cvt.u64.u32 %rd5723, %r126; setp.le.u64 %p321, %rd166, %rd5723; @%p321 bra $L__BB1_162; mov.b32 %f5272, %r91; neg.f32 %f113, %f5272; setp.lt.u32 %p322, %r45, 64; @%p322 bra $L__BB1_160; bra.uni $L__BB1_159; $L__BB1_160: mul.wide.u32 %rd5733, %r45, 8; add.s64 %rd5734, %rd9, %rd5733; mov.u64 %rd11544, 0; st.local.u32 [%rd5734], %r126; st.local.f32 [%rd5734+4], %f113; add.s32 %r45, %r45, 1; st.local.u32 [%rd9+512], %r45; mov.u64 %rd11545, %rd11544; bra.uni $L__BB1_161; $L__BB1_155: mov.b32 %f5271, %r91; mov.b32 %f1872, %r43; setp.leu.f32 %p315, %f1872, %f5271; setp.eq.s32 %p316, %r4814, 4; 
or.pred %p317, %p316, %p315; @%p317 bra $L__BB1_162; ld.u32 %r1871, [%rd178+72]; cvt.u64.u32 %rd5720, %r1871; setp.le.u64 %p318, %rd169, %rd5720; mul.wide.u32 %rd5721, %r1871, 12; add.s64 %rd5722, %rd170, %rd5721; setp.eq.s64 %p319, %rd5722, 0; or.pred %p320, %p318, %p319; selp.b32 %r40, %r40, %r4805, %p320; selp.b32 %r39, %r39, %r4804, %p320; selp.b32 %r38, %r38, %r4803, %p320; selp.b32 %r42, %r42, %r4814, %p320; selp.b32 %r43, %r43, %r91, %p320; bra.uni $L__BB1_162; $L__BB1_159: mov.u64 %rd11545, 1; shl.b64 %rd11544, %rd5723, 32; $L__BB1_161: mov.u64 %rd11019, 0; cvt.u32.u64 %r1873, %rd11019; cvt.u32.u64 %r1874, %rd11544; or.b32 %r1875, %r1874, %r1873; cvt.u32.u64 %r1876, %rd11545; or.b32 %r1877, %r1875, %r1876; setp.ne.s32 %p323, %r1877, 0; @%p323 bra $L__BB1_171; $L__BB1_162: and.b64 %rd5735, %rd189, 8; setp.eq.s64 %p324, %rd5735, 0; @%p324 bra $L__BB1_71; ld.u8 %rs259, [%rd178+88]; and.b16 %rs260, %rs259, 1; setp.eq.b16 %p325, %rs260, 1; mov.pred %p326, 0; xor.pred %p327, %p325, %p326; not.pred %p328, %p327; @%p328 bra $L__BB1_166; bra.uni $L__BB1_164; $L__BB1_166: ld.u32 %r140, [%rd178+76]; cvt.u64.u32 %rd5739, %r140; setp.le.u64 %p335, %rd166, %rd5739; @%p335 bra $L__BB1_71; mov.b32 %f5274, %r92; neg.f32 %f114, %f5274; setp.lt.u32 %p336, %r45, 64; @%p336 bra $L__BB1_169; bra.uni $L__BB1_168; $L__BB1_169: mul.wide.u32 %rd5749, %r45, 8; add.s64 %rd5750, %rd9, %rd5749; mov.u64 %rd11546, 0; st.local.u32 [%rd5750], %r140; st.local.f32 [%rd5750+4], %f114; add.s32 %r45, %r45, 1; st.local.u32 [%rd9+512], %r45; mov.u64 %rd11547, %rd11546; bra.uni $L__BB1_170; $L__BB1_164: mov.b32 %f5273, %r92; mov.b32 %f1873, %r43; setp.leu.f32 %p329, %f1873, %f5273; setp.eq.s32 %p330, %r4815, 4; or.pred %p331, %p330, %p329; @%p331 bra $L__BB1_71; bra.uni $L__BB1_165; $L__BB1_168: mov.u64 %rd11547, 1; shl.b64 %rd11546, %rd5739, 32; $L__BB1_170: mov.u64 %rd11022, 0; cvt.u32.u64 %r1880, %rd11022; cvt.u32.u64 %r1881, %rd11546; or.b32 %r1882, %r1881, %r1880; cvt.u32.u64 %r1883, %rd11547; 
or.b32 %r1884, %r1882, %r1883; setp.eq.s32 %p337, %r1884, 0; @%p337 bra $L__BB1_71; bra.uni $L__BB1_171; $L__BB1_172: mov.u64 %rd11549, 2; mov.u64 %rd11548, 0; setp.eq.s32 %p338, %r42, 4; mov.u64 %rd11550, %rd11548; @%p338 bra $L__BB1_174; mov.b64 %rd11550, {%r38, %r39}; mov.b32 {%rs261, %rs262}, %r40; mov.b64 %rd5757, {%r40, %r1885}; and.b64 %rd11548, %rd5757, 4294967040; cvt.u64.u16 %rd5758, %rs261; and.b64 %rd11549, %rd5758, 255; $L__BB1_174: mov.u64 %rd11551, 2; mov.u64 %rd11552, 0; or.b64 %rd5765, %rd11549, %rd11548; or.b64 %rd5766, %rd5765, %rd11552; mov.b64 {%r1886, %r1887}, %rd5766; mov.b32 {%rs17, %rs263}, %r1886; and.b16 %rs264, %rs17, 255; setp.eq.s16 %p339, %rs264, 2; @%p339 bra $L__BB1_176; cvt.u32.u64 %r1888, %rd11550; mov.b32 %f1874, %r1888; shr.u64 %rd5767, %rd11550, 32; cvt.u32.u64 %r1889, %rd5767; mov.b32 %f1875, %r1889; ld.global.f32 %f1876, [%rd35+248]; mul.f32 %f1877, %f1876, %f1874; ld.global.f32 %f1878, [%rd35+252]; mul.f32 %f1879, %f1878, %f1875; sub.f32 %f1880, %f1877, %f1879; mul.f32 %f1881, %f1878, %f1874; fma.rn.f32 %f1882, %f1876, %f1875, %f1881; ld.global.f32 %f1883, [%rd35+256]; add.f32 %f1884, %f1883, %f1880; mov.b32 %r1890, %f1884; ld.global.f32 %f1885, [%rd35+260]; add.f32 %f1886, %f1885, %f1882; mov.b32 %r1891, %f1886; cvt.u64.u32 %rd5768, %r1891; cvt.u64.u32 %rd5769, %r1890; cvt.u64.u16 %rd5770, %rs17; bfi.b64 %rd11552, %rd5768, %rd5769, 32, 32; and.b64 %rd5771, %rd5770, 255; mov.b64 {%r1892, %r1893}, %rd5771; mov.b32 {%rs265, %rs266}, %r1892; cvt.u64.u16 %rd11551, %rs265; $L__BB1_176: mov.u64 %rd11031, 0; or.b64 %rd5778, %rd11031, %rd11551; or.b64 %rd516, %rd5778, %rd11031; mov.b64 {%r1894, %r1895}, %rd516; mov.b32 {%rs18, %rs267}, %r1894; and.b16 %rs268, %rs18, 255; setp.eq.s16 %p340, %rs268, 2; mov.u64 %rd11553, 2; mov.u64 %rd11554, %rd11031; mov.u64 %rd11555, %rd11031; @%p340 bra $L__BB1_178; and.b64 %rd5780, %rd516, 4294967040; cvt.u64.u16 %rd5781, %rs18; and.b64 %rd5782, %rd5781, 255; or.b64 %rd5783, %rd5782, %rd11031; 
or.b64 %rd5784, %rd5783, %rd5780; mov.b64 {%r1896, %r1897}, %rd5784; mov.b32 {%rs269, %rs270}, %r1896; not.b16 %rs271, %rs269; ld.global.u8 %rs272, [%rd35+240]; setp.eq.s16 %p341, %rs272, 0; and.b16 %rs273, %rs271, 1; selp.b16 %rs274, %rs269, %rs273, %p341; and.b64 %rd5785, %rd5784, 4294967040; cvt.u64.u16 %rd5786, %rs274; and.b64 %rd5787, %rd5786, 255; or.b64 %rd5788, %rd5785, %rd11031; or.b64 %rd5789, %rd5788, %rd5787; mov.b64 {%r1898, %r1899}, %rd5789; mov.b32 {%rs275, %rs276}, %r1898; and.b64 %rd11555, %rd5789, 4294967040; cvt.u64.u16 %rd5790, %rs275; and.b64 %rd11553, %rd5790, 255; mov.u64 %rd11554, %rd11552; $L__BB1_178: or.b64 %rd5791, %rd11554, %rd11031; or.b64 %rd5792, %rd11031, %rd11553; or.b64 %rd5793, %rd5792, %rd11555; or.b64 %rd5794, %rd5791, %rd11031; mov.b64 {%r4846, %r4847}, %rd5794; mov.b64 {%r4848, %r1900}, %rd5793; bra.uni $L__BB1_235; $L__BB1_38: cvt.u32.u64 %r1693, %rd46; cvt.u32.u64 %r1694, %rd65; rem.u32 %r1695, %r1694, %r1693; cvt.u64.u32 %rd11395, %r1695; $L__BB1_39: shl.b64 %rd5316, %rd11395, 3; add.s64 %rd69, %rd47, %rd5316; ld.u32 %rd5317, [%rd69]; ld.u32 %rd5318, [%rd69+4]; bfi.b64 %rd70, %rd5318, %rd5317, 32, 32; add.s64 %rd71, %rd11395, 1; or.b64 %rd5319, %rd71, %rd46; and.b64 %rd5320, %rd5319, -4294967296; setp.eq.s64 %p158, %rd5320, 0; @%p158 bra $L__BB1_41; rem.u64 %rd11396, %rd71, %rd46; bra.uni $L__BB1_42; $L__BB1_51: cvt.u32.u64 %r1703, %rd46; cvt.u32.u64 %r1704, %rd112; rem.u32 %r1705, %r1704, %r1703; cvt.u64.u32 %rd11412, %r1705; $L__BB1_52: add.u64 %rd11338, %SP, 32; add.u64 %rd5364, %SP, 560; add.u64 %rd5365, %SPL, 560; add.s64 %rd11420, %rd5365, 8; add.s64 %rd11426, %rd5365, 16; or.b64 %rd11422, %rd5364, 8; add.s64 %rd11416, %rd11338, 36; add.s64 %rd11414, %rd45, 36; add.s64 %rd11413, %rd45, 44; shl.b64 %rd5368, %rd11412, 3; add.s64 %rd5369, %rd47, %rd5368; ld.u32 %rd5370, [%rd5369]; ld.u32 %rd5371, [%rd5369+4]; bfi.b64 %rd123, %rd5371, %rd5370, 32, 32; st.local.v2.u64 [%rd5365], {%rd113, %rd123}; mov.u64 %rd11427, 2; 
mov.u64 %rd11415, %rd11414; mov.u64 %rd11417, %rd11414; mov.u64 %rd11418, %rd11414; mov.u64 %rd11419, %rd11416; mov.u64 %rd11421, %rd11420; mov.u64 %rd11423, %rd11420; mov.u64 %rd11424, %rd11420; mov.u64 %rd11425, %rd11422; $L__BB1_53: setp.eq.s64 %p167, %rd11427, 0; @%p167 bra $L__BB1_56; add.s64 %rd11427, %rd11427, -1; add.s64 %rd5372, %rd11414, 8; setp.eq.s64 %p168, %rd11417, %rd11413; selp.b64 %rd5373, %rd5372, %rd11417, %p168; add.s64 %rd5374, %rd11415, 8; selp.b64 %rd5375, %rd5374, %rd11418, %p168; add.s64 %rd5376, %rd11416, 8; selp.b64 %rd5377, %rd5376, %rd11419, %p168; setp.eq.s64 %p169, %rd11427, 0; add.s64 %rd5378, %rd5373, 4; add.s64 %rd5379, %rd5375, 4; add.s64 %rd5380, %rd5377, 4; selp.b64 %rd140, %rd5373, %rd5378, %p169; selp.b64 %rd11418, %rd5375, %rd5379, %p169; selp.b64 %rd11419, %rd5377, %rd5380, %p169; selp.b64 %rd11414, %rd5372, %rd11414, %p168; selp.b64 %rd11415, %rd5374, %rd11415, %p168; selp.b64 %rd11416, %rd5376, %rd11416, %p168; add.s64 %rd5381, %rd11417, 8; selp.b64 %rd11413, %rd5381, %rd11413, %p168; add.s64 %rd5382, %rd11423, 8; setp.eq.s64 %p170, %rd11420, %rd11426; selp.b64 %rd5383, %rd5382, %rd11420, %p170; add.s64 %rd5384, %rd11424, 8; selp.b64 %rd5385, %rd5384, %rd11421, %p170; add.s64 %rd5386, %rd11425, 8; selp.b64 %rd5387, %rd5386, %rd11422, %p170; selp.b64 %rd11423, %rd5382, %rd11423, %p170; selp.b64 %rd11424, %rd5384, %rd11424, %p170; selp.b64 %rd11425, %rd5386, %rd11425, %p170; add.s64 %rd5388, %rd11420, 8; selp.b64 %rd11426, %rd5388, %rd11426, %p170; add.s64 %rd5389, %rd5383, 4; add.s64 %rd5390, %rd5385, 4; add.s64 %rd5391, %rd5387, 4; selp.b64 %rd11420, %rd5383, %rd5389, %p169; selp.b64 %rd11421, %rd5385, %rd5390, %p169; selp.b64 %rd11422, %rd5387, %rd5391, %p169; ld.local.f32 %f1643, [%rd5385]; ld.local.f32 %f1644, [%rd5375]; setp.eq.f32 %p171, %f1644, %f1643; mov.u64 %rd11417, %rd140; @%p171 bra $L__BB1_53; bra.uni $L__BB1_55; $L__BB1_56: cvt.u32.u64 %r1706, %rd113; mov.b32 %f1645, %r1706; shr.u64 %rd5392, %rd113, 32; 
cvt.u32.u64 %r1707, %rd5392; mov.b32 %f1646, %r1707; shr.u64 %rd5393, %rd123, 32; cvt.u32.u64 %r1708, %rd5393; cvt.u32.u64 %r1709, %rd123; mov.b32 %f1647, %r1709; sub.f32 %f1648, %f1647, %f1645; mov.b32 %f1649, %r1708; sub.f32 %f1650, %f1649, %f1646; neg.f32 %f5317, %f1648; neg.f32 %f5318, %f1650; bra.uni $L__BB1_57; $L__BB1_41: cvt.u32.u64 %r1696, %rd46; cvt.u32.u64 %r1697, %rd71; rem.u32 %r1698, %r1697, %r1696; cvt.u64.u32 %rd11396, %r1698; $L__BB1_42: add.u64 %rd11337, %SP, 32; add.u64 %rd11406, %SP, 560; cvta.to.local.u64 %rd11404, %rd11406; add.s64 %rd11410, %rd11404, 8; add.s64 %rd11398, %rd45, 44; add.s64 %rd11397, %rd45, 52; add.s64 %rd11400, %rd11337, 44; shl.b64 %rd5324, %rd11396, 3; add.s64 %rd81, %rd47, %rd5324; ld.u32 %rd5325, [%rd81]; ld.u32 %rd5326, [%rd81+4]; bfi.b64 %rd5327, %rd5326, %rd5325, 32, 32; st.local.v2.u64 [%rd11404], {%rd70, %rd5327}; mov.u64 %rd11411, 2; mov.u64 %rd11399, %rd11398; mov.u64 %rd11401, %rd11398; mov.u64 %rd11402, %rd11398; mov.u64 %rd11403, %rd11400; mov.u64 %rd11405, %rd11404; mov.u64 %rd11407, %rd11404; mov.u64 %rd11408, %rd11404; mov.u64 %rd11409, %rd11406; $L__BB1_43: setp.eq.s64 %p159, %rd11411, 0; @%p159 bra $L__BB1_46; add.s64 %rd11411, %rd11411, -1; add.s64 %rd5328, %rd11398, 8; setp.eq.s64 %p160, %rd11401, %rd11397; selp.b64 %rd5329, %rd5328, %rd11401, %p160; add.s64 %rd5330, %rd11399, 8; selp.b64 %rd5331, %rd5330, %rd11402, %p160; add.s64 %rd5332, %rd11400, 8; selp.b64 %rd5333, %rd5332, %rd11403, %p160; setp.eq.s64 %p161, %rd11411, 0; add.s64 %rd5334, %rd5329, 4; add.s64 %rd5335, %rd5331, 4; add.s64 %rd5336, %rd5333, 4; selp.b64 %rd98, %rd5329, %rd5334, %p161; selp.b64 %rd11402, %rd5331, %rd5335, %p161; selp.b64 %rd11403, %rd5333, %rd5336, %p161; selp.b64 %rd11398, %rd5328, %rd11398, %p160; selp.b64 %rd11399, %rd5330, %rd11399, %p160; selp.b64 %rd11400, %rd5332, %rd11400, %p160; add.s64 %rd5337, %rd11401, 8; selp.b64 %rd11397, %rd5337, %rd11397, %p160; add.s64 %rd5338, %rd11407, 8; setp.eq.s64 %p162, %rd11404, 
%rd11410; selp.b64 %rd5339, %rd5338, %rd11404, %p162; add.s64 %rd5340, %rd11408, 8; selp.b64 %rd5341, %rd5340, %rd11405, %p162; add.s64 %rd5342, %rd11409, 8; selp.b64 %rd5343, %rd5342, %rd11406, %p162; selp.b64 %rd11407, %rd5338, %rd11407, %p162; selp.b64 %rd11408, %rd5340, %rd11408, %p162; selp.b64 %rd11409, %rd5342, %rd11409, %p162; add.s64 %rd5344, %rd11404, 8; selp.b64 %rd11410, %rd5344, %rd11410, %p162; add.s64 %rd5345, %rd5339, 4; add.s64 %rd5346, %rd5341, 4; add.s64 %rd5347, %rd5343, 4; selp.b64 %rd11404, %rd5339, %rd5345, %p161; selp.b64 %rd11405, %rd5341, %rd5346, %p161; selp.b64 %rd11406, %rd5343, %rd5347, %p161; ld.local.f32 %f1637, [%rd5341]; ld.local.f32 %f1638, [%rd5331]; setp.eq.f32 %p163, %f1638, %f1637; mov.u64 %rd11401, %rd98; @%p163 bra $L__BB1_43; bra.uni $L__BB1_45; $L__BB1_46: ld.u32 %rd5348, [%rd69]; ld.u32 %rd5349, [%rd69+4]; bfi.b64 %rd5350, %rd5349, %rd5348, 32, 32; cvt.u32.u64 %r1699, %rd5350; mov.b32 %f1639, %r1699; shr.u64 %rd5351, %rd5350, 32; cvt.u32.u64 %r1700, %rd5351; mov.b32 %f1640, %r1700; ld.u32 %rd5352, [%rd81]; ld.u32 %rd5353, [%rd81+4]; bfi.b64 %rd5354, %rd5353, %rd5352, 32, 32; cvt.u32.u64 %r1701, %rd5354; shr.u64 %rd5355, %rd5354, 32; cvt.u32.u64 %r1702, %rd5355; mov.b32 %f1641, %r1701; sub.f32 %f5317, %f1641, %f1639; mov.b32 %f1642, %r1702; sub.f32 %f5318, %f1642, %f1640; $L__BB1_57: mul.f32 %f1651, %f41, %f5318; fma.rn.f32 %f48, %f40, %f5317, %f1651; mul.f32 %f1652, %f5318, %f5318; fma.rn.f32 %f1653, %f5317, %f5317, %f1652; add.f32 %f1654, %f1653, 0f00000000; sqrt.rn.f32 %f1655, %f1654; mul.f32 %f1656, %f1655, 0f3A83126F; abs.f32 %f1657, %f48; setp.gt.f32 %p172, %f1657, %f1656; @%p172 bra $L__BB1_59; bra.uni $L__BB1_58; $L__BB1_59: setp.ge.f32 %p2908, %f48, 0f00000000; bra.uni $L__BB1_62; $L__BB1_58: ld.local.u64 %rd5394, [%rd45+8]; cvt.u32.u64 %r1710, %rd5394; mov.b32 %f1658, %r1710; shr.u64 %rd5395, %rd5394, 32; cvt.u32.u64 %r1711, %rd5395; mov.b32 %f1659, %r1711; sub.f32 %f1660, %f2, %f1658; sub.f32 %f1661, %f3, 
%f1659; mul.f32 %f1662, %f41, %f1661; fma.rn.f32 %f1663, %f40, %f1660, %f1662; setp.le.f32 %p2908, %f1663, 0f00000000; $L__BB1_62: selp.u16 %rs212, 1, 0, %p2908; st.local.u8 [%rd45+16], %rs212; $L__BB1_63: ld.local.v2.u32 {%r4784, %r4785}, [%rd45+8]; ld.local.u32 %r4786, [%rd45+16]; $L__BB1_65: setp.eq.s32 %p173, %r23, 2; mov.u64 %rd5403, 0; mov.u64 %rd11428, 2; mov.u64 %rd11429, %rd5403; @%p173 bra $L__BB1_67; setp.ne.s16 %p174, %rs2, 0; cvt.u16.u32 %rs214, %r4786; selp.u16 %rs215, 1, 0, %p174; xor.b16 %rs216, %rs214, %rs215; mov.b32 %f1670, %r4784; mov.b32 %f1671, %r4785; mul.f32 %f1672, %f11, %f1670; ld.global.f32 %f1673, [%rd35+252]; mul.f32 %f1674, %f1673, %f1671; sub.f32 %f1675, %f1672, %f1674; mul.f32 %f1676, %f1673, %f1670; fma.rn.f32 %f1677, %f11, %f1671, %f1676; add.f32 %f1678, %f9, %f1675; mov.b32 %r1716, %f1678; add.f32 %f1679, %f10, %f1677; mov.b32 %r1717, %f1679; cvt.u64.u32 %rd5404, %r1717; cvt.u64.u32 %rd5405, %r1716; cvt.u64.u16 %rd5406, %rs216; bfi.b64 %rd11429, %rd5404, %rd5405, 32, 32; and.b64 %rd5407, %rd5406, 255; mov.b64 {%r1718, %r1719}, %rd5407; mov.b32 {%rs217, %rs218}, %r1718; cvt.u64.u16 %rd11428, %rs217; $L__BB1_67: or.b64 %rd5408, %rd5403, %rd5403; or.b64 %rd5409, %rd11428, %rd5403; or.b64 %rd5410, %rd5409, %rd5403; or.b64 %rd5411, %rd5408, %rd11429; mov.b64 {%r4846, %r4847}, %rd5411; mov.b64 {%r4848, %r1720}, %rd5410; $L__BB1_235: add.s64 %rd11381, %rd35, 280; mov.b32 {%rs23, %rs286}, %r4848; and.b16 %rs287, %rs23, 255; setp.eq.s16 %p419, %rs287, 2; add.s64 %rd11388, %rd38, 1; @%p419 bra $L__BB1_5; add.s64 %rd11381, %rd35, 280; mov.b64 %rd5874, {%r4846, %r4847}; mov.b32 %f1987, %r4846; shr.u64 %rd5875, %rd5874, 32; cvt.u32.u64 %r1968, %rd5875; mov.b32 %f1988, %r1968; sub.f32 %f174, %f2, %f1987; sub.f32 %f175, %f3, %f1988; mul.f32 %f1989, %f175, %f175; fma.rn.f32 %f1990, %f174, %f174, %f1989; add.f32 %f1991, %f1990, 0f00000000; sqrt.rn.f32 %f5312, %f1991; setp.geu.f32 %p420, %f5312, %f8; @%p420 bra $L__BB1_5; bra.uni $L__BB1_237; 
$L__BB1_238: and.b16 %rs1014, %rs1, 255; setp.eq.s16 %p2906, %rs1014, 2; @%p2906 bra $L__BB1_240; bra.uni $L__BB1_239; $L__BB1_240: mov.u64 %rd5879, 3; st.global.u64 [%rd13+20], %rd5879; bra.uni $L__BB1_241; $L__BB1_239: and.b16 %rs289, %rs1, 1; setp.eq.b16 %p422, %rs289, 1; selp.b64 %rd5878, 1, 2, %p422; st.global.u64 [%rd13+20], %rd5878; st.global.u64 [%rd13+28], %rd29; st.global.u64 [%rd13+44], %rd30; $L__BB1_241: ld.param.u64 %rd11373, [grid_update_param_3]; ld.param.f32 %f5235, [grid_update_param_1]; cvta.to.global.u64 %rd1182, %rd5233; mul.f32 %f177, %f5235, 0f3DCCCCCD; mul.f32 %f178, %f177, 0f00000000; add.f32 %f179, %f177, %f2; add.f32 %f180, %f178, %f3; setp.eq.s64 %p423, %rd11373, 0; mov.u32 %r4922, 2; mov.u64 %rd607, 0; mov.u64 %rd1183, %rd5233; mov.u64 %rd11756, %rd607; @%p423 bra $L__BB1_478; ld.param.u64 %rd607, [grid_update_param_3]; add.u64 %rd5883, %SP, 560; add.u64 %rd5884, %SPL, 560; add.s64 %rd579, %rd5884, 8; add.u64 %rd5887, %SP, 0; add.u64 %rd5888, %SPL, 0; add.s64 %rd580, %rd5888, 8; add.s64 %rd581, %rd5888, 8; add.s64 %rd582, %rd5888, 8; add.s64 %rd583, %rd5888, 8; add.s64 %rd584, %rd5888, 8; add.s64 %rd585, %rd5888, 8; add.u64 %rd5899, %SP, 552; add.u64 %rd5900, %SPL, 552; add.s64 %rd586, %rd5900, 8; add.u64 %rd5901, %SP, 32; add.u64 %rd5902, %SPL, 32; add.s64 %rd587, %rd5902, 36; add.s64 %rd588, %rd5902, 4; add.s64 %rd589, %rd5901, 36; add.s64 %rd590, %rd5902, 44; add.s64 %rd591, %rd5901, 44; add.s64 %rd592, %rd5902, 52; add.s64 %rd593, %rd5884, 8; add.s64 %rd594, %rd5884, 8; or.b64 %rd595, %rd5883, 8; add.s64 %rd596, %rd5884, 16; add.s64 %rd597, %rd7, 32; add.s64 %rd598, %rd7, 48; add.s64 %rd599, %rd7, 64; add.s64 %rd600, %rd7, 80; add.s64 %rd601, %rd7, 96; add.s64 %rd602, %rd7, 112; cvta.to.global.u64 %rd11570, %rd5233; mov.u64 %rd11571, %rd5233; $L__BB1_243: mov.u64 %rd606, %rd11571; mov.u64 %rd605, %rd11570; add.s64 %rd607, %rd607, -1; setp.eq.s64 %p424, %rd606, 0; @%p424 bra $L__BB1_477; add.s64 %rd608, %rd605, 272; ld.global.u32 
%r1977, [%rd605+272]; mov.u64 %rd11756, 0; setp.eq.s32 %p425, %r1977, 3; mov.u32 %r4921, 2; @%p425 bra $L__BB1_474; ld.global.u16 %rs290, [%rd608+-272]; setp.eq.s16 %p426, %rs290, 1; @%p426 bra $L__BB1_416; setp.eq.s16 %p427, %rs290, 2; @%p427 bra $L__BB1_305; setp.ne.s16 %p428, %rs290, 3; @%p428 bra $L__BB1_454; ld.global.u8 %rs25, [%rd608+-248]; ld.global.f32 %f181, [%rd608+-16]; sub.f32 %f1992, %f179, %f181; ld.global.f32 %f182, [%rd608+-12]; sub.f32 %f1993, %f180, %f182; ld.global.f32 %f1994, [%rd608+-20]; ld.global.f32 %f183, [%rd608+-24]; mul.f32 %f1995, %f1993, %f1994; fma.rn.f32 %f184, %f1992, %f183, %f1995; mul.f32 %f1996, %f1992, %f1994; mul.f32 %f1997, %f1993, %f183; sub.f32 %f185, %f1997, %f1996; cvta.to.local.u64 %rd609, %rd5901; mov.u32 %r197, 2; st.local.u32 [%rd609+20], %r197; ld.global.u64 %rd610, [%rd608+-256]; setp.eq.s64 %p429, %rd610, 0; @%p429 bra $L__BB1_302; mov.b32 %r1992, %f185; ld.global.u64 %rd611, [%rd608+-264]; mov.b32 %r1993, %f184; and.b32 %r1994, %r1993, 2147483647; mov.b32 %f186, %r1994; and.b32 %r1995, %r1992, 2147483647; mov.b32 %f187, %r1995; mov.u64 %rd11573, 1; bra.uni $L__BB1_250; $L__BB1_258: sub.f32 %f2009, %f5333, %f184; abs.f32 %f202, %f2009; setp.le.f32 %p439, %f202, 0f34000000; @%p439 bra $L__BB1_260; abs.f32 %f2010, %f5333; abs.f32 %f2011, %f184; setp.gt.f32 %p441, %f2011, %f2010; selp.f32 %f2012, %f2011, %f2010, %p441; mul.f32 %f2013, %f2012, 0f34000000; setp.gtu.f32 %p442, %f202, %f2013; @%p442 bra $L__BB1_264; bra.uni $L__BB1_260; $L__BB1_250: shl.b64 %rd5912, %rd11573, 3; add.s64 %rd5913, %rd611, %rd5912; setp.eq.s64 %p430, %rd11573, %rd610; selp.b64 %rd5914, 0, %rd11573, %p430; shl.b64 %rd5915, %rd5914, 3; add.s64 %rd5916, %rd611, %rd5915; ld.u32 %rd5917, [%rd5913+-8]; ld.u32 %rd5918, [%rd5913+-4]; bfi.b64 %rd614, %rd5918, %rd5917, 32, 32; ld.u32 %rd5919, [%rd5916]; ld.u32 %rd5920, [%rd5916+4]; bfi.b64 %rd615, %rd5920, %rd5919, 32, 32; cvt.u32.u64 %r4850, %rd614; mov.b32 %f5333, %r4850; shr.u64 %rd5921, %rd614, 
32; cvt.u32.u64 %r1998, %rd5921; mov.b32 %f190, %r1998; cvt.u32.u64 %r181, %rd615; shr.u64 %rd5922, %rd615, 32; cvt.u32.u64 %r1999, %rd5922; mov.b32 %f191, %r181; sub.f32 %f192, %f191, %f5333; mov.b32 %f1999, %r1999; sub.f32 %f193, %f1999, %f190; sub.f32 %f2000, %f184, %f5333; sub.f32 %f2001, %f185, %f190; mul.f32 %f2002, %f193, %f2001; fma.rn.f32 %f194, %f192, %f2000, %f2002; mul.f32 %f2003, %f193, %f193; fma.rn.f32 %f2004, %f192, %f192, %f2003; add.f32 %f195, %f2004, 0f00000000; setp.gtu.f32 %p431, %f194, 0f00000000; mov.b64 {%r2000, %r4851}, %rd614; mov.b64 {%r2001, %r183}, %rd615; @%p431 bra $L__BB1_252; bra.uni $L__BB1_251; $L__BB1_252: setp.ltu.f32 %p432, %f194, %f195; @%p432 bra $L__BB1_254; bra.uni $L__BB1_253; $L__BB1_254: setp.eq.f32 %p433, %f195, 0f00000000; @%p433 bra $L__BB1_301; div.rn.f32 %f2005, %f194, %f195; mov.f32 %f2006, 0f3F800000; sub.f32 %f2007, %f2006, %f2005; mov.b32 %r4853, %f2007; mov.b32 %r4854, %f2005; fma.rn.f32 %f5333, %f192, %f2005, %f5333; mov.b32 %r4850, %f5333; fma.rn.f32 %f5334, %f193, %f2005, %f190; mov.b32 %r4851, %f5334; mov.u32 %r4852, 1; bra.uni $L__BB1_256; $L__BB1_251: mov.b32 %f5334, %r4851; mov.u32 %r4852, 0; mov.u32 %r4853, %r4852; bra.uni $L__BB1_256; $L__BB1_253: mov.b32 %f5334, %r183; mov.u32 %r4853, 1; mov.u32 %r4852, 0; mov.f32 %f5333, %f191; mov.u32 %r4850, %r181; mov.u32 %r4851, %r183; $L__BB1_256: setp.eq.f32 %p434, %f184, %f5333; @%p434 bra $L__BB1_260; bra.uni $L__BB1_257; $L__BB1_260: setp.eq.f32 %p444, %f5334, %f185; mov.pred %p443, -1; mov.pred %p2913, %p443; @%p444 bra $L__BB1_264; setp.eq.f32 %p446, %f187, 0f7F800000; and.b32 %r2010, %r4851, 2147483647; mov.b32 %f2014, %r2010; setp.eq.f32 %p447, %f2014, 0f7F800000; or.pred %p448, %p446, %p447; mov.pred %p2913, 0; @%p448 bra $L__BB1_264; sub.f32 %f2015, %f5334, %f185; abs.f32 %f203, %f2015; setp.le.f32 %p450, %f203, 0f34000000; mov.pred %p2913, %p443; @%p450 bra $L__BB1_264; abs.f32 %f2016, %f5334; abs.f32 %f2017, %f185; setp.gt.f32 %p451, %f2017, %f2016; 
selp.f32 %f2018, %f2017, %f2016, %p451; mul.f32 %f2019, %f2018, 0f34000000; setp.le.f32 %p2913, %f203, %f2019; bra.uni $L__BB1_264; $L__BB1_257: setp.eq.f32 %p436, %f186, 0f7F800000; and.b32 %r2009, %r4850, 2147483647; mov.b32 %f2008, %r2009; setp.eq.f32 %p437, %f2008, 0f7F800000; or.pred %p438, %p436, %p437; mov.pred %p2913, 0; @%p438 bra $L__BB1_264; bra.uni $L__BB1_258; $L__BB1_264: cvt.u64.u32 %rd5923, %r4851; cvt.u64.u32 %rd5924, %r4850; bfi.b64 %rd616, %rd5923, %rd5924, 32, 32; mov.b64 {%r2011, %r2012}, %rd616; selp.u64 %rd617, 1, 0, %p2913; mov.b32 %f205, %r2012; mov.b32 %f204, %r2011; sub.f32 %f2020, %f204, %f184; sub.f32 %f2021, %f205, %f185; mul.f32 %f2022, %f2021, %f2021; fma.rn.f32 %f2023, %f2020, %f2020, %f2022; add.f32 %f2024, %f2023, 0f00000000; sqrt.rn.f32 %f207, %f2024; setp.geu.f32 %p452, %f207, %f5335; setp.ne.s32 %p453, %r197, 2; and.pred %p454, %p453, %p452; @%p454 bra $L__BB1_266; add.s64 %rd11574, %rd11573, -1; st.local.u64 [%rd609], %rd11574; st.local.v2.f32 [%rd609+8], {%f204, %f205}; mov.b64 {%r2015, %r2016}, %rd617; st.local.v2.u32 [%rd609+16], {%r2015, %r4852}; st.local.v2.u32 [%rd609+24], {%r4853, %r4854}; st.local.f32 [%rd609+32], %f207; st.local.u32 [%rd609+36], %rd614; st.local.u32 [%rd609+44], %rd615; st.local.u32 [%rd609+40], %rd5921; st.local.u32 [%rd609+48], %rd5922; mov.f32 %f5335, %f207; mov.u32 %r197, %r4852; $L__BB1_266: add.s64 %rd620, %rd11573, 1; setp.lt.u64 %p455, %rd11573, %rd610; mov.u64 %rd11573, %rd620; @%p455 bra $L__BB1_250; ld.local.u32 %rd5931, [%rd609+36]; ld.local.u32 %rd5932, [%rd609+40]; bfi.b64 %rd5933, %rd5932, %rd5931, 32, 32; mov.u64 %rd5930, 0; cvt.u32.u64 %r2017, %rd5933; mov.b32 %f2025, %r2017; shr.u64 %rd5934, %rd5933, 32; cvt.u32.u64 %r2018, %rd5934; mov.b32 %f2026, %r2018; ld.local.u32 %rd5935, [%rd609+44]; ld.local.u32 %rd5936, [%rd609+48]; bfi.b64 %rd5937, %rd5936, %rd5935, 32, 32; cvt.u32.u64 %r2019, %rd5937; shr.u64 %rd5938, %rd5937, 32; cvt.u32.u64 %r2020, %rd5938; mov.b32 %f2027, %r2019; 
sub.f32 %f209, %f2027, %f2025; mov.b32 %f2028, %r2020; sub.f32 %f210, %f2028, %f2026; mul.f32 %f2029, %f210, %f210; fma.rn.f32 %f2030, %f209, %f209, %f2029; add.f32 %f211, %f2030, 0f00000000; setp.leu.f32 %p456, %f211, 0f28800000; mov.u64 %rd11575, %rd5930; mov.u64 %rd11576, %rd5930; mov.u64 %rd11577, %rd5930; @%p456 bra $L__BB1_269; neg.f32 %f2031, %f209; sqrt.rn.f32 %f2032, %f211; div.rn.f32 %f2033, %f210, %f2032; div.rn.f32 %f2034, %f2031, %f2032; mov.b32 %r2021, %f2034; mov.b32 %r2022, %f2033; mov.u64 %rd11577, 1; mov.b64 %rd5941, {%r2022, %r2021}; shr.u64 %rd11576, %rd5941, 32; shl.b64 %rd11575, %rd5941, 32; $L__BB1_269: or.b64 %rd627, %rd11577, %rd11575; or.b64 %rd628, %rd5930, %rd11576; and.b64 %rd5942, %rd5930, 4294967295; xor.b64 %rd5943, %rd11577, 1; or.b64 %rd5944, %rd5943, %rd5942; setp.ne.s64 %p457, %rd5944, 0; @%p457 bra $L__BB1_300; mov.b64 {%r2023, %r2024}, %rd628; mov.b64 {%r2025, %r2026}, %rd627; mov.b32 %f212, %r2026; mov.b32 %f213, %r2023; setp.eq.s32 %p458, %r197, 1; @%p458 bra $L__BB1_298; bra.uni $L__BB1_271; $L__BB1_298: ld.local.u64 %rd6023, [%rd609+8]; cvt.u32.u64 %r2047, %rd6023; mov.b32 %f2062, %r2047; shr.u64 %rd6024, %rd6023, 32; cvt.u32.u64 %r2048, %rd6024; mov.b32 %f2063, %r2048; sub.f32 %f2064, %f179, %f2062; sub.f32 %f2065, %f180, %f2063; mul.f32 %f2066, %f213, %f2065; fma.rn.f32 %f2067, %f212, %f2064, %f2066; setp.le.f32 %p2914, %f2067, 0f00000000; bra.uni $L__BB1_299; $L__BB1_305: ld.global.f32 %f2078, [%rd608+-16]; mov.u64 %rd11731, 0; sub.f32 %f2079, %f179, %f2078; ld.global.f32 %f2080, [%rd608+-12]; sub.f32 %f2081, %f180, %f2080; ld.global.f32 %f2082, [%rd608+-20]; ld.global.f32 %f2083, [%rd608+-24]; mul.f32 %f2084, %f2081, %f2082; fma.rn.f32 %f221, %f2079, %f2083, %f2084; mul.f32 %f2085, %f2079, %f2082; mul.f32 %f2086, %f2081, %f2083; sub.f32 %f222, %f2086, %f2085; mov.b32 %r2056, %f221; mov.b32 %r2057, %f222; cvt.u64.u32 %rd6045, %r2057; cvt.u64.u32 %rd6046, %r2056; bfi.b64 %rd6047, %rd6045, %rd6046, 32, 32; st.local.u64 
[%rd5900], %rd6047; ld.global.u64 %rd730, [%rd608+-240]; setp.eq.s64 %p479, %rd730, 0; mov.u64 %rd11732, 2; mov.u64 %rd11733, %rd11731; @%p479 bra $L__BB1_411; cvta.to.local.u64 %rd731, %rd5901; mov.u32 %r2064, 0; st.local.u32 [%rd731], %r2064; mov.u32 %r2065, -16777217; st.local.u32 [%rd731+4], %r2065; mov.u32 %r219, 1; st.local.u32 [%rd731+512], %r219; ld.global.u64 %rd732, [%rd608+-248]; ld.global.u64 %rd733, [%rd608+-192]; ld.global.u64 %rd734, [%rd608+-200]; mov.u32 %r217, 2139095039; mov.u32 %r216, 4; bra.uni $L__BB1_308; $L__BB1_416: ld.global.f32 %f288, [%rd608+-16]; sub.f32 %f2282, %f179, %f288; ld.global.f32 %f289, [%rd608+-12]; sub.f32 %f2283, %f180, %f289; ld.global.f32 %f2284, [%rd608+-20]; ld.global.f32 %f290, [%rd608+-24]; mul.f32 %f2285, %f2283, %f2284; fma.rn.f32 %f291, %f2282, %f290, %f2285; mul.f32 %f2286, %f2282, %f2284; mul.f32 %f2287, %f2283, %f290; sub.f32 %f292, %f2287, %f2286; mov.b32 %r320, %f291; mov.b32 %r321, %f292; ld.global.u64 %rd1093, [%rd608+-216]; ld.global.u64 %rd1092, [%rd608+-224]; sub.f32 %f2288, %f291, %f6; sub.f32 %f2289, %f292, %f6; mov.b32 %r2236, %f2288; mov.b32 %r2237, %f2289; cvt.u64.u32 %rd6434, %r2237; cvt.u64.u32 %rd6435, %r2236; add.f32 %f2290, %f6, %f291; add.f32 %f2291, %f6, %f292; mov.b32 %r2238, %f2290; mov.b32 %r2239, %f2291; cvt.u64.u32 %rd6436, %r2239; cvt.u64.u32 %rd6437, %r2238; bfi.b64 %rd6438, %rd6434, %rd6435, 32, 32; mov.b64 {%r2240, %r2241}, %rd6438; bfi.b64 %rd6439, %rd6436, %rd6437, 32, 32; mov.b64 {%r2242, %r2243}, %rd6439; cvta.to.local.u64 %rd1094, %rd5901; mov.u16 %rs356, 2; st.local.u8 [%rd1094+8], %rs356; mov.b32 %f296, %r2243; mov.b32 %f294, %r2241; mov.b32 %f295, %r2242; mov.b32 %f293, %r2240; ld.global.v2.f32 {%f2292, %f2293}, [%rd608+-232]; div.rn.f32 %f299, %f293, %f2292; div.rn.f32 %f300, %f295, %f2292; ld.global.u64 %rd1095, [%rd608+-256]; cvt.rn.f32.u64 %f2294, %rd1095; add.f32 %f2295, %f2294, 0fBF800000; rcp.rn.f32 %f301, %f2295; setp.lt.f32 %p646, %f300, 0fBF000000; setp.gt.f32 %p647, 
%f299, 0f3F000000; or.pred %p648, %p647, %p646; @%p648 bra $L__BB1_448; add.f32 %f2296, %f299, 0f3F000000; div.rn.f32 %f2297, %f2296, %f301; cvt.rmi.f32.f32 %f2298, %f2297; add.s64 %rd6441, %rd1095, -2; cvt.rn.f32.u64 %f2299, %rd6441; setp.gt.f32 %p649, %f2298, 0f00000000; setp.lt.f32 %p650, %f2298, %f2299; selp.f32 %f2300, %f2298, %f2299, %p650; selp.f32 %f2301, %f2300, 0f00000000, %p649; setp.gt.f32 %p651, %f2301, 0f5F7FFFFF; max.f32 %f2302, %f2301, 0f00000000; cvt.rzi.u64.f32 %rd6442, %f2302; selp.b64 %rd1101, -1, %rd6442, %p651; add.f32 %f2303, %f300, 0f3F000000; div.rn.f32 %f2304, %f2303, %f301; cvt.rpi.f32.f32 %f2305, %f2304; add.s64 %rd6443, %rd1095, -1; cvt.rn.f32.u64 %f2306, %rd6443; setp.gt.f32 %p652, %f2305, 0f00000000; setp.lt.f32 %p653, %f2305, %f2306; selp.f32 %f2307, %f2305, %f2306, %p653; selp.f32 %f2308, %f2307, 0f00000000, %p652; setp.gt.f32 %p654, %f2308, 0f5F7FFFFF; max.f32 %f2309, %f2308, 0f00000000; cvt.rzi.u64.f32 %rd6444, %f2309; selp.b64 %rd1097, -1, %rd6444, %p654; setp.ge.u64 %p655, %rd1101, %rd1097; @%p655 bra $L__BB1_448; div.rn.f32 %f302, %f294, %f2293; div.rn.f32 %f303, %f296, %f2293; ld.global.u64 %rd1098, [%rd608+-240]; ld.global.u64 %rd1099, [%rd608+-248]; ld.global.u64 %rd1100, [%rd608+-264]; and.b32 %r2244, %r320, 2147483647; mov.b32 %f304, %r2244; and.b32 %r2245, %r321, 2147483647; mov.b32 %f305, %r2245; ld.local.v4.u32 {%r4915, %r4916, %r4917, %r2249}, [%rd1094]; mov.f32 %f5347, 0f7F7FFFFF; bra.uni $L__BB1_419; $L__BB1_454: ld.global.f32 %f330, [%rd608+-16]; sub.f32 %f2351, %f179, %f330; ld.global.f32 %f331, [%rd608+-12]; sub.f32 %f2352, %f180, %f331; ld.global.f32 %f332, [%rd608+-20]; ld.global.f32 %f333, [%rd608+-24]; mul.f32 %f2353, %f2352, %f332; fma.rn.f32 %f334, %f2351, %f333, %f2353; mul.f32 %f2354, %f2351, %f332; mul.f32 %f2355, %f2352, %f333; sub.f32 %f335, %f2355, %f2354; ld.global.u32 %rd6471, [%rd608+-264]; ld.global.u32 %rd6472, [%rd608+-260]; bfi.b64 %rd6473, %rd6472, %rd6471, 32, 32; cvt.u32.u64 %r2288, %rd6473; 
mov.b32 %f2356, %r2288; shr.u64 %rd6474, %rd6473, 32; cvt.u32.u64 %r2289, %rd6474; mov.b32 %f2357, %r2289; neg.f32 %f2358, %f2356; neg.f32 %f2359, %f2357; sub.f32 %f336, %f2358, %f334; sub.f32 %f337, %f2359, %f335; sub.f32 %f338, %f334, %f2356; sub.f32 %f339, %f335, %f2357; setp.ge.f32 %p704, %f336, 0f00000000; selp.f32 %f2360, %f336, 0f00000000, %p704; setp.ge.f32 %p705, %f337, 0f00000000; selp.f32 %f2361, %f337, 0f00000000, %p705; setp.ge.f32 %p706, %f338, 0f00000000; selp.f32 %f2362, %f338, 0f00000000, %p706; setp.ge.f32 %p707, %f339, 0f00000000; selp.f32 %f2363, %f339, 0f00000000, %p707; sub.f32 %f340, %f2360, %f2362; mov.b32 %r2290, %f340; sub.f32 %f341, %f2361, %f2363; mov.b32 %r2291, %f341; cvt.u64.u32 %rd6475, %r2291; cvt.u64.u32 %rd6476, %r2290; bfi.b64 %rd6477, %rd6475, %rd6476, 32, 32; st.local.u64 [%rd5884], %rd6477; mov.u64 %rd11747, 2; mov.u64 %rd11740, %rd579; mov.u64 %rd11741, %rd5884; mov.u64 %rd11742, %rd5884; mov.u64 %rd11743, %rd5883; mov.u64 %rd11744, %rd5884; mov.u64 %rd11745, %rd5884; mov.u64 %rd11746, %rd5883; $L__BB1_455: setp.eq.s64 %p708, %rd11747, 0; @%p708 bra $L__BB1_458; add.s64 %rd11747, %rd11747, -1; add.s64 %rd6478, %rd11744, 8; setp.eq.s64 %p709, %rd11744, %rd11740; selp.b64 %rd11740, %rd6478, %rd11740, %p709; add.s64 %rd6479, %rd11741, 8; selp.b64 %rd11741, %rd6479, %rd11741, %p709; add.s64 %rd6480, %rd11742, 8; selp.b64 %rd11742, %rd6480, %rd11742, %p709; add.s64 %rd6481, %rd11743, 8; selp.b64 %rd11743, %rd6481, %rd11743, %p709; selp.b64 %rd6482, %rd6479, %rd11744, %p709; selp.b64 %rd6483, %rd6480, %rd11745, %p709; selp.b64 %rd6484, %rd6481, %rd11746, %p709; setp.eq.s64 %p710, %rd11747, 0; add.s64 %rd6485, %rd6482, 4; add.s64 %rd6486, %rd6483, 4; add.s64 %rd6487, %rd6484, 4; selp.b64 %rd11744, %rd6482, %rd6485, %p710; selp.b64 %rd11745, %rd6483, %rd6486, %p710; selp.b64 %rd11746, %rd6484, %rd6487, %p710; ld.local.f32 %f2364, [%rd6483]; setp.eq.f32 %p711, %f2364, 0f00000000; @%p711 bra $L__BB1_455; add.f32 %f2365, %f334, %f340; 
mov.b32 %r2292, %f2365; add.f32 %f2366, %f335, %f341; mov.b32 %r2293, %f2366; cvt.u64.u32 %rd6490, %r2293; cvt.u64.u32 %rd6491, %r2292; bfi.b64 %rd11750, %rd6490, %rd6491, 32, 32; mov.u64 %rd11751, 0; bra.uni $L__BB1_471; $L__BB1_458: setp.lt.f32 %p712, %f336, %f338; mov.f32 %f5348, 0fFF7FFFFF; @%p712 bra $L__BB1_461; bra.uni $L__BB1_459; $L__BB1_461: setp.leu.f32 %p717, %f338, 0fFF7FFFFF; mov.pred %p2918, 0; @%p717 bra $L__BB1_463; mov.f32 %f5348, %f338; bra.uni $L__BB1_463; $L__BB1_459: setp.leu.f32 %p714, %f336, 0fFF7FFFFF; mov.pred %p2918, 0; @%p714 bra $L__BB1_463; mov.pred %p2918, -1; mov.f32 %f5348, %f336; $L__BB1_463: setp.lt.f32 %p719, %f337, %f339; @%p719 bra $L__BB1_466; bra.uni $L__BB1_464; $L__BB1_466: setp.gt.f32 %p721, %f339, %f5348; @%p721 bra $L__BB1_469; bra.uni $L__BB1_467; $L__BB1_469: cvta.to.local.u64 %rd6498, %rd5901; mov.u64 %rd6499, 0; st.local.u64 [%rd6498], %rd6499; neg.f32 %f5350, %f339; mov.u64 %rd11749, %rd588; bra.uni $L__BB1_470; $L__BB1_464: setp.leu.f32 %p720, %f337, %f5348; @%p720 bra $L__BB1_467; mov.u64 %rd6494, 0; st.local.u64 [%rd5902], %rd6494; mov.u64 %rd11749, %rd588; mov.f32 %f5348, %f337; bra.uni $L__BB1_468; $L__BB1_467: mov.u64 %rd6496, 0; st.local.u64 [%rd5902], %rd6496; neg.f32 %f5350, %f5348; not.pred %p722, %p2918; mov.u64 %rd11749, %rd5902; @%p722 bra $L__BB1_470; $L__BB1_468: mov.f32 %f5350, %f5348; $L__BB1_470: st.local.f32 [%rd11749], %f5350; ld.local.u64 %rd6504, [%rd5902]; cvt.u32.u64 %r2294, %rd6504; mov.b32 %f2369, %r2294; shr.u64 %rd6505, %rd6504, 32; cvt.u32.u64 %r2295, %rd6505; mov.b32 %f2370, %r2295; add.f32 %f2371, %f334, %f2369; add.f32 %f2372, %f335, %f2370; mov.b32 %r2296, %f2371; mov.b32 %r2297, %f2372; cvt.u64.u32 %rd6506, %r2297; cvt.u64.u32 %rd6507, %r2296; bfi.b64 %rd11750, %rd6506, %rd6507, 32, 32; mov.u64 %rd11751, 1; $L__BB1_471: mov.u64 %rd11069, 0; cvt.u32.u64 %r2298, %rd11750; mov.b32 %f2373, %r2298; shr.u64 %rd6508, %rd11750, 32; cvt.u32.u64 %r2299, %rd6508; mov.b32 %f2374, %r2299; 
mul.f32 %f2375, %f333, %f2373; mul.f32 %f2376, %f332, %f2374; sub.f32 %f2377, %f2375, %f2376; mul.f32 %f2378, %f333, %f2374; fma.rn.f32 %f2379, %f332, %f2373, %f2378; add.f32 %f2380, %f330, %f2377; mov.b32 %r2300, %f2380; add.f32 %f2381, %f331, %f2379; mov.b32 %r2301, %f2381; cvt.u64.u32 %rd6509, %r2301; cvt.u64.u32 %rd6510, %r2300; bfi.b64 %rd6511, %rd6509, %rd6510, 32, 32; or.b64 %rd6512, %rd11069, %rd6511; mov.b64 {%r4918, %r4919}, %rd6512; mov.b64 {%r4920, %r2302}, %rd11751; bra.uni $L__BB1_472; $L__BB1_436: sub.f32 %f2322, %f5345, %f291; abs.f32 %f323, %f2322; setp.le.f32 %p674, %f323, 0f34000000; @%p674 bra $L__BB1_438; abs.f32 %f2323, %f5345; abs.f32 %f2324, %f291; setp.gt.f32 %p676, %f2324, %f2323; selp.f32 %f2325, %f2324, %f2323, %p676; mul.f32 %f2326, %f2325, 0f34000000; setp.gtu.f32 %p677, %f323, %f2326; @%p677 bra $L__BB1_442; bra.uni $L__BB1_438; $L__BB1_419: setp.gt.u64 %p656, %rd1098, %rd1101; @%p656 bra $L__BB1_421; bra.uni $L__BB1_420; $L__BB1_421: add.s64 %rd6445, %rd1099, %rd1101; ld.u8 %rs357, [%rd6445]; setp.eq.s16 %p657, %rs357, 0; @%p657 bra $L__BB1_446; cvt.rn.f32.u64 %f2311, %rd1101; fma.rn.f32 %f307, %f301, %f2311, 0fBF000000; setp.gt.u64 %p658, %rd1095, %rd1101; @%p658 bra $L__BB1_424; bra.uni $L__BB1_423; $L__BB1_424: shl.b64 %rd6446, %rd1101, 2; add.s64 %rd1102, %rd1100, %rd6446; ld.f32 %f308, [%rd1102]; add.s64 %rd6447, %rd1101, 1; setp.gt.u64 %p659, %rd1095, %rd6447; @%p659 bra $L__BB1_426; bra.uni $L__BB1_425; $L__BB1_426: ld.f32 %f309, [%rd1102+4]; setp.gt.f32 %p660, %f309, %f303; setp.gt.f32 %p661, %f308, %f303; and.pred %p662, %p661, %p660; @%p662 bra $L__BB1_446; setp.lt.f32 %p663, %f308, %f302; setp.lt.f32 %p664, %f309, %f302; and.pred %p665, %p663, %p664; @%p665 bra $L__BB1_446; mul.f32 %f2312, %f2292, %f307; mov.b32 %r2250, %f2312; mul.f32 %f312, %f2293, %f308; mov.b32 %r2251, %f312; cvt.u64.u32 %rd6448, %r2251; cvt.u64.u32 %rd6449, %r2250; add.f32 %f2313, %f301, %f307; mul.f32 %f310, %f2292, %f2313; mov.b32 %r328, %f310; 
mul.f32 %f2314, %f2293, %f309; mov.b32 %r2252, %f2314; cvt.u64.u32 %rd6450, %r2252; cvt.u64.u32 %rd6451, %r328; bfi.b64 %rd6452, %rd6450, %rd6451, 32, 32; bfi.b64 %rd6453, %rd6448, %rd6449, 32, 32; cvt.u32.u64 %r4913, %rd6453; mov.b32 %f5345, %r4913; sub.f32 %f313, %f310, %f5345; sub.f32 %f314, %f2314, %f312; sub.f32 %f2315, %f291, %f5345; sub.f32 %f2316, %f292, %f312; mul.f32 %f2317, %f314, %f2316; fma.rn.f32 %f315, %f313, %f2315, %f2317; mul.f32 %f2318, %f314, %f314; fma.rn.f32 %f2319, %f313, %f313, %f2318; add.f32 %f316, %f2319, 0f00000000; setp.gtu.f32 %p666, %f315, 0f00000000; mov.b64 {%r2253, %r4914}, %rd6453; mov.b64 {%r2254, %r331}, %rd6452; @%p666 bra $L__BB1_430; bra.uni $L__BB1_429; $L__BB1_430: setp.ltu.f32 %p667, %f315, %f316; @%p667 bra $L__BB1_432; bra.uni $L__BB1_431; $L__BB1_432: setp.eq.f32 %p668, %f316, 0f00000000; @%p668 bra $L__BB1_445; div.rn.f32 %f2320, %f315, %f316; fma.rn.f32 %f5345, %f313, %f2320, %f5345; mov.b32 %r4913, %f5345; fma.rn.f32 %f5346, %f314, %f2320, %f312; mov.b32 %r4914, %f5346; bra.uni $L__BB1_434; $L__BB1_429: mov.b32 %f5346, %r4914; bra.uni $L__BB1_434; $L__BB1_431: mov.b32 %f5346, %r331; mov.f32 %f5345, %f310; mov.u32 %r4913, %r328; mov.u32 %r4914, %r331; $L__BB1_434: setp.eq.f32 %p669, %f291, %f5345; @%p669 bra $L__BB1_438; bra.uni $L__BB1_435; $L__BB1_438: setp.eq.f32 %p679, %f5346, %f292; mov.pred %p678, -1; mov.pred %p2916, %p678; @%p679 bra $L__BB1_442; setp.eq.f32 %p681, %f305, 0f7F800000; and.b32 %r2256, %r4914, 2147483647; mov.b32 %f2327, %r2256; setp.eq.f32 %p682, %f2327, 0f7F800000; or.pred %p683, %p681, %p682; mov.pred %p2916, 0; @%p683 bra $L__BB1_442; sub.f32 %f2328, %f5346, %f292; abs.f32 %f324, %f2328; setp.le.f32 %p685, %f324, 0f34000000; mov.pred %p2916, %p678; @%p685 bra $L__BB1_442; abs.f32 %f2329, %f5346; abs.f32 %f2330, %f292; setp.gt.f32 %p686, %f2330, %f2329; selp.f32 %f2331, %f2330, %f2329, %p686; mul.f32 %f2332, %f2331, 0f34000000; setp.le.f32 %p2916, %f324, %f2332; bra.uni $L__BB1_442; 
$L__BB1_435: setp.eq.f32 %p671, %f304, 0f7F800000; and.b32 %r2255, %r4913, 2147483647; mov.b32 %f2321, %r2255; setp.eq.f32 %p672, %f2321, 0f7F800000; or.pred %p673, %p671, %p672; mov.pred %p2916, 0; @%p673 bra $L__BB1_442; bra.uni $L__BB1_436; $L__BB1_442: cvt.u64.u32 %rd6454, %r4914; cvt.u64.u32 %rd6455, %r4913; bfi.b64 %rd1103, %rd6454, %rd6455, 32, 32; mov.b64 {%r2257, %r2258}, %rd1103; selp.u64 %rd1104, 1, 0, %p2916; mov.b32 %f2333, %r2257; sub.f32 %f2334, %f2333, %f291; mov.b32 %f2335, %r2258; sub.f32 %f2336, %f2335, %f292; mul.f32 %f2337, %f2336, %f2336; fma.rn.f32 %f2338, %f2334, %f2334, %f2337; add.f32 %f325, %f2338, 0f00000000; setp.geu.f32 %p687, %f325, %f5347; @%p687 bra $L__BB1_446; sqrt.rn.f32 %f2339, %f325; setp.gtu.f32 %p688, %f2339, %f6; mov.f32 %f5347, %f325; @%p688 bra $L__BB1_446; mov.b64 {%r4917, %r2259}, %rd1104; mov.u32 %r4915, %r2257; mov.u32 %r4916, %r2258; mov.f32 %f5347, %f325; $L__BB1_446: add.s64 %rd1101, %rd1101, 1; setp.lt.u64 %p689, %rd1101, %rd1097; @%p689 bra $L__BB1_419; st.local.u32 [%rd1094+8], %r4917; mov.b64 %rd6456, {%r4915, %r4916}; st.local.u64 [%rd1094], %rd6456; $L__BB1_448: cvt.u64.u32 %rd6457, %r320; cvt.u64.u32 %rd6458, %r321; bfi.b64 %rd1106, %rd6458, %rd6457, 32, 32; ld.local.v4.u32 {%r2263, %r2264, %r2265, %r2266}, [%rd1094]; mov.b64 %rd1108, {%r2265, %r2266}; mov.b64 %rd1107, {%r2263, %r2264}; mov.b32 {%rs358, %rs359}, %r2265; and.b16 %rs360, %rs358, 255; setp.eq.s16 %p690, %rs360, 2; cvt.u64.u16 %rd6459, %rs358; and.b64 %rd6460, %rd6459, 255; selp.b64 %rd6461, 2, %rd6460, %p690; and.b64 %rd6462, %rd1108, 4294967040; or.b64 %rd6463, %rd6462, %rd6461; mov.b64 {%r2271, %r2272}, %rd6463; mov.b32 {%rs1021, %rs361}, %r2271; and.b16 %rs362, %rs1021, 255; setp.eq.s16 %p691, %rs362, 2; mov.u32 %r4920, 2; mov.u32 %r4918, 0; mov.u32 %r4919, %r4918; @%p691 bra $L__BB1_472; ld.global.u8 %rs363, [%rd608+-208]; setp.eq.s16 %p692, %rs363, 0; shr.u64 %rd6464, %rd1107, 32; cvt.u32.u64 %r2273, %rd6464; mov.b32 %f327, %r2273; @%p692 
bra $L__BB1_453; mov.b64 {%r2274, %r2275}, %rd1106; mov.b32 %f329, %r2275; mov.b32 %f328, %r2274; mov.b64 {%r2276, %r2277}, %rd1092; mov.b64 {%r2278, %r2279}, %rd1093; ld.global.u8 %rs43, [%rd608+-207]; mov.b32 %f2340, %r2278; setp.gt.f32 %p694, %f328, %f2340; mov.b32 %f2341, %r2276; setp.lt.f32 %p695, %f328, %f2341; or.pred %p696, %p695, %p694; mov.pred %p2917, 0; @%p696 bra $L__BB1_452; setp.geu.f32 %p697, %f329, 0fFF7FFFFF; setp.leu.f32 %p698, %f329, 0f7F7FFFFF; and.pred %p2917, %p698, %p697; $L__BB1_452: setp.ge.f32 %p699, %f292, %f327; setp.le.f32 %p700, %f292, %f327; setp.eq.s16 %p701, %rs43, 0; selp.u32 %r2280, -1, 0, %p699; selp.u32 %r2281, -1, 0, %p700; selp.b32 %r2282, %r2281, %r2280, %p701; and.b32 %r2283, %r2282, 1; setp.eq.b32 %p702, %r2283, 1; and.pred %p703, %p702, %p2917; selp.u16 %rs1021, 1, 0, %p703; $L__BB1_453: cvt.u32.u64 %r2284, %rd1107; mov.b32 %f2342, %r2284; mul.f32 %f2343, %f290, %f2342; ld.global.f32 %f2344, [%rd608+-20]; mul.f32 %f2345, %f2344, %f327; sub.f32 %f2346, %f2343, %f2345; mul.f32 %f2347, %f2344, %f2342; fma.rn.f32 %f2348, %f290, %f327, %f2347; add.f32 %f2349, %f288, %f2346; mov.b32 %r2285, %f2349; add.f32 %f2350, %f289, %f2348; mov.b32 %r2286, %f2350; cvt.u64.u32 %rd6465, %r2286; cvt.u64.u32 %rd6466, %r2285; cvt.u64.u16 %rd6467, %rs1021; bfi.b64 %rd6468, %rd6465, %rd6466, 32, 32; and.b64 %rd6469, %rd6467, 255; mov.b64 {%r4918, %r4919}, %rd6468; mov.b64 {%r4920, %r2287}, %rd6469; bra.uni $L__BB1_472; $L__BB1_271: ld.local.u32 %r2027, [%rd609+24]; setp.eq.s32 %p459, %r2027, 0; @%p459 bra $L__BB1_284; setp.ne.s32 %p460, %r2027, 1; @%p460 bra $L__BB1_297; add.s64 %rd629, %rd11574, 1; or.b64 %rd5945, %rd629, %rd610; and.b64 %rd5946, %rd5945, -4294967296; setp.eq.s64 %p461, %rd5946, 0; @%p461 bra $L__BB1_275; rem.u64 %rd11578, %rd629, %rd610; bra.uni $L__BB1_276; $L__BB1_284: setp.eq.s64 %p468, %rd11574, 0; selp.b64 %rd676, %rd610, %rd11574, %p468; add.s64 %rd5985, %rd676, -1; setp.gt.u64 %p469, %rd610, %rd5985; @%p469 bra 
$L__BB1_286; bra.uni $L__BB1_285; $L__BB1_286: shl.b64 %rd5986, %rd676, 3; add.s64 %rd5987, %rd611, %rd5986; ld.u32 %rd5988, [%rd5987+-8]; ld.u32 %rd5989, [%rd5987+-4]; bfi.b64 %rd677, %rd5989, %rd5988, 32, 32; or.b64 %rd5990, %rd676, %rd610; and.b64 %rd5991, %rd5990, -4294967296; setp.eq.s64 %p470, %rd5991, 0; @%p470 bra $L__BB1_288; rem.u64 %rd11595, %rd676, %rd610; bra.uni $L__BB1_289; $L__BB1_402: ld.u32 %r2213, [%rd742+76]; cvt.u64.u32 %rd6373, %r2213; setp.le.u64 %p636, %rd733, %rd6373; mul.wide.u32 %rd6374, %r2213, 12; add.s64 %rd6375, %rd734, %rd6374; setp.eq.s64 %p637, %rd6375, 0; or.pred %p638, %p636, %p637; selp.b32 %r214, %r214, %r4874, %p638; selp.b32 %r213, %r213, %r4873, %p638; selp.b32 %r212, %r212, %r4872, %p638; selp.b32 %r216, %r216, %r4887, %p638; selp.b32 %r217, %r217, %r266, %p638; $L__BB1_308: mov.u32 %r218, %r219; setp.eq.s32 %p480, %r218, 0; @%p480 bra $L__BB1_409; mov.b32 %f5296, %r217; cvta.to.local.u64 %rd11355, %rd5901; cvt.u64.u32 %rd6054, %r218; add.s64 %rd6055, %rd6054, -1; cvt.u32.u64 %r219, %rd6055; st.local.u32 [%rd11355+512], %r219; mul.wide.u32 %rd6056, %r218, 8; add.s64 %rd6057, %rd11355, %rd6056; ld.local.u32 %rd740, [%rd6057+-4]; ld.local.u32 %rd6058, [%rd6057+-8]; shl.b64 %rd6059, %rd6058, 32; or.b64 %rd739, %rd6059, 1; mov.b64 {%r2069, %r2070}, %rd740; mov.b32 %f2087, %r2069; neg.f32 %f2088, %f2087; setp.le.f32 %p481, %f5296, %f2088; @%p481 bra $L__BB1_308; mov.b64 {%r2071, %r2072}, %rd739; cvt.u64.u32 %rd741, %r2072; setp.gt.u64 %p482, %rd730, %rd741; @%p482 bra $L__BB1_312; bra.uni $L__BB1_311; $L__BB1_312: mul.lo.s64 %rd6060, %rd741, 96; add.s64 %rd742, %rd732, %rd6060; ld.u8 %rs298, [%rd742+88]; and.b16 %rs299, %rs298, 1; setp.eq.b16 %p484, %rs299, 1; mov.pred %p2915, 0; xor.pred %p485, %p484, %p2915; not.pred %p486, %p485; @%p486 bra $L__BB1_314; ld.v4.u32 {%r2073, %r2074, %r2075, %r2076}, [%rd742+64]; cvt.u64.u32 %rd6061, %r2073; setp.gt.u64 %p488, %rd733, %rd6061; mul.wide.u32 %rd6062, %r2073, 12; add.s64 %rd6063, 
%rd734, %rd6062; selp.b64 %rd6064, %rd6063, 0, %p488; setp.eq.s64 %p489, %rd6064, 0; add.s64 %rd6065, %rd6064, 8; selp.b64 %rd11616, 0, %rd6065, %p489; cvt.u64.u32 %rd6066, %r2074; setp.gt.u64 %p490, %rd733, %rd6066; mul.wide.u32 %rd6067, %r2074, 12; add.s64 %rd6068, %rd734, %rd6067; selp.b64 %rd6069, %rd6068, 0, %p490; setp.eq.s64 %p491, %rd6069, 0; add.s64 %rd6070, %rd6069, 8; selp.b64 %rd11615, 0, %rd6070, %p491; ld.u32 %r2080, [%rd742+72]; cvt.u64.u32 %rd6071, %r2080; setp.gt.u64 %p492, %rd733, %rd6071; mul.wide.u32 %rd6072, %r2080, 12; add.s64 %rd6073, %rd734, %rd6072; selp.b64 %rd6074, %rd6073, 0, %p492; setp.eq.s64 %p493, %rd6074, 0; add.s64 %rd6075, %rd6074, 8; selp.b64 %rd11614, 0, %rd6075, %p493; cvt.u64.u32 %rd6076, %r2076; setp.gt.u64 %p494, %rd733, %rd6076; mul.wide.u32 %rd6077, %r2076, 12; add.s64 %rd6078, %rd734, %rd6077; selp.b64 %rd6079, %rd6078, 0, %p494; setp.eq.s64 %p495, %rd6079, 0; add.s64 %rd6080, %rd6079, 8; selp.b64 %rd11613, 0, %rd6080, %p495; mov.pred %p2915, -1; $L__BB1_314: mov.b32 %f5297, %r217; ld.v4.f32 {%f2089, %f2090, %f2091, %f2092}, [%rd742]; sub.f32 %f2097, %f2089, %f221; sub.f32 %f2098, %f2090, %f221; sub.f32 %f2099, %f2091, %f221; sub.f32 %f2100, %f2092, %f221; ld.v4.f32 {%f2101, %f2102, %f2103, %f2104}, [%rd742+16]; sub.f32 %f2109, %f2101, %f222; sub.f32 %f2110, %f2102, %f222; sub.f32 %f2111, %f2103, %f222; sub.f32 %f2112, %f2104, %f222; ld.v4.f32 {%f2113, %f2114, %f2115, %f2116}, [%rd742+32]; sub.f32 %f2121, %f221, %f2113; sub.f32 %f2122, %f221, %f2114; sub.f32 %f2123, %f221, %f2115; sub.f32 %f2124, %f221, %f2116; ld.v4.f32 {%f2125, %f2126, %f2127, %f2128}, [%rd742+48]; sub.f32 %f2133, %f222, %f2125; sub.f32 %f2134, %f222, %f2126; sub.f32 %f2135, %f222, %f2127; sub.f32 %f2136, %f222, %f2128; setp.ge.f32 %p496, %f2097, %f2121; selp.f32 %f2137, %f2097, %f2121, %p496; setp.ge.f32 %p497, %f2098, %f2122; selp.f32 %f2138, %f2098, %f2122, %p497; setp.ge.f32 %p498, %f2099, %f2123; selp.f32 %f2139, %f2099, %f2123, %p498; setp.ge.f32 
%p499, %f2100, %f2124; selp.f32 %f2140, %f2100, %f2124, %p499; setp.ge.f32 %p500, %f2109, %f2133; selp.f32 %f2141, %f2109, %f2133, %p500; setp.ge.f32 %p501, %f2110, %f2134; selp.f32 %f2142, %f2110, %f2134, %p501; setp.ge.f32 %p502, %f2111, %f2135; selp.f32 %f2143, %f2111, %f2135, %p502; setp.ge.f32 %p503, %f2112, %f2136; selp.f32 %f2144, %f2112, %f2136, %p503; setp.ge.f32 %p504, %f2137, 0f00000000; selp.f32 %f2145, %f2137, 0f00000000, %p504; setp.ge.f32 %p505, %f2138, 0f00000000; selp.f32 %f2146, %f2138, 0f00000000, %p505; setp.ge.f32 %p506, %f2139, 0f00000000; selp.f32 %f2147, %f2139, 0f00000000, %p506; setp.ge.f32 %p507, %f2140, 0f00000000; selp.f32 %f2148, %f2140, 0f00000000, %p507; mov.b32 %r2081, %f2145; mov.b32 %r2082, %f2146; mov.b32 %r2083, %f2147; mov.b32 %r2084, %f2148; cvt.u64.u32 %rd6081, %r2084; cvt.u64.u32 %rd6082, %r2082; cvt.u64.u32 %rd6083, %r2081; cvt.u64.u32 %rd6084, %r2083; bfi.b64 %rd6085, %rd6081, %rd6084, 32, 32; bfi.b64 %rd6086, %rd6082, %rd6083, 32, 32; setp.ge.f32 %p508, %f2141, 0f00000000; selp.f32 %f2149, %f2141, 0f00000000, %p508; setp.ge.f32 %p509, %f2142, 0f00000000; selp.f32 %f2150, %f2142, 0f00000000, %p509; setp.ge.f32 %p510, %f2143, 0f00000000; selp.f32 %f2151, %f2143, 0f00000000, %p510; setp.ge.f32 %p511, %f2144, 0f00000000; selp.f32 %f2152, %f2144, 0f00000000, %p511; mov.b32 %r2085, %f2149; mov.b32 %r2086, %f2150; mov.b32 %r2087, %f2151; mov.b32 %r2088, %f2152; cvt.u64.u32 %rd6087, %r2088; cvt.u64.u32 %rd6088, %r2086; cvt.u64.u32 %rd6089, %r2085; cvt.u64.u32 %rd6090, %r2087; bfi.b64 %rd6091, %rd6087, %rd6090, 32, 32; bfi.b64 %rd6092, %rd6088, %rd6089, 32, 32; mov.b64 {%r2089, %r2090}, %rd6086; mov.b64 {%r2091, %r2092}, %rd6085; cvt.u64.u32 %rd6093, %r2092; cvt.u64.u32 %rd6094, %r2090; cvt.u64.u32 %rd6095, %r2091; bfi.b64 %rd6096, %rd6093, %rd6095, 32, 32; mov.b64 {%r2093, %r2094}, %rd6096; bfi.b64 %rd6097, %rd6094, %rd6083, 32, 32; mov.b64 {%r2095, %r2096}, %rd6097; mov.b32 %f2153, %r2095; mov.b32 %f2154, %r2096; mov.b32 %f2155, 
%r2093; mov.b32 %f2156, %r2094; mov.b32 %f2157, %r2089; mov.b32 %f2158, %r2090; mov.b32 %f2159, %r2091; mov.b32 %f2160, %r2092; mov.b64 {%r2097, %r2098}, %rd6092; mov.b64 {%r2099, %r2100}, %rd6091; cvt.u64.u32 %rd6098, %r2100; cvt.u64.u32 %rd6099, %r2098; cvt.u64.u32 %rd6100, %r2099; bfi.b64 %rd6101, %rd6098, %rd6100, 32, 32; mov.b64 {%r2101, %r2102}, %rd6101; bfi.b64 %rd6102, %rd6099, %rd6089, 32, 32; mov.b64 {%r2103, %r2104}, %rd6102; mov.b32 %f2161, %r2103; mov.b32 %f2162, %r2104; mov.b32 %f2163, %r2101; mov.b32 %f2164, %r2102; mov.b32 %f2165, %r2097; mov.b32 %f2166, %r2098; mov.b32 %f2167, %r2099; mov.b32 %f2168, %r2100; mul.f32 %f2169, %f2165, %f2161; mul.f32 %f2170, %f2166, %f2162; mul.f32 %f2171, %f2167, %f2163; mul.f32 %f2172, %f2168, %f2164; fma.rn.f32 %f2173, %f2157, %f2153, %f2169; fma.rn.f32 %f2174, %f2158, %f2154, %f2170; fma.rn.f32 %f2175, %f2159, %f2155, %f2171; fma.rn.f32 %f2176, %f2160, %f2156, %f2172; add.f32 %f2177, %f2173, 0f00000000; add.f32 %f2178, %f2174, 0f00000000; add.f32 %f2179, %f2175, 0f00000000; add.f32 %f2180, %f2176, 0f00000000; sqrt.rn.f32 %f2181, %f2177; sqrt.rn.f32 %f2182, %f2178; sqrt.rn.f32 %f2183, %f2179; sqrt.rn.f32 %f2184, %f2180; mov.b32 %r2105, %f2181; mov.b32 %r2106, %f2182; mov.b32 %r2107, %f2183; mov.b32 %r2108, %f2184; cvt.u64.u32 %rd6103, %r2108; cvt.u64.u32 %rd6104, %r2106; cvt.u64.u32 %rd6105, %r2105; cvt.u64.u32 %rd6106, %r2107; bfi.b64 %rd11722, %rd6103, %rd6106, 32, 32; mov.b64 {%r2109, %r2110}, %rd11722; bfi.b64 %rd11721, %rd6104, %rd6105, 32, 32; mov.b64 {%r2111, %r2112}, %rd11721; mov.b32 %f2185, %r2111; mov.b32 %f2186, %r2112; mov.b32 %f2187, %r2109; mov.b32 %f2188, %r2110; setp.lt.f32 %p512, %f2185, %f5297; setp.lt.f32 %p513, %f2186, %f5297; setp.lt.f32 %p514, %f2187, %f5297; setp.lt.f32 %p515, %f2188, %f5297; selp.u32 %r2113, 1, 0, %p512; selp.u32 %r2114, -1, 0, %p513; bfi.b32 %r2115, %r2114, %r2113, 8, 1; selp.u32 %r2116, -1, 0, %p514; bfi.b32 %r2117, %r2116, %r2115, 16, 1; selp.u32 %r2118, -1, 0, %p515; 
bfi.b32 %r2119, %r2118, %r2117, 24, 1; cvt.u64.u32 %rd6107, %r2119; mov.b64 {%r2120, %r2121}, %rd6107; mov.b32 {%rs300, %rs301}, %r2120; and.b16 %rs302, %rs300, 1; shr.u16 %rs303, %rs300, 7; and.b16 %rs304, %rs303, 2; or.b16 %rs305, %rs304, %rs302; shl.b16 %rs306, %rs301, 2; and.b16 %rs307, %rs306, 4; or.b16 %rs308, %rs305, %rs307; shr.u16 %rs309, %rs301, 5; and.b16 %rs310, %rs309, 8; or.b16 %rs311, %rs308, %rs310; cvt.u64.u16 %rd753, %rs311; @%p2915 bra $L__BB1_316; bra.uni $L__BB1_315; $L__BB1_316: mov.u64 %rd757, 1; st.local.v2.u64 [%rd8], {%rd11616, %rd11615}; st.local.v2.u64 [%rd8+16], {%rd11614, %rd11613}; mov.f32 %f2189, 0f00000000; st.local.v4.f32 [%rd24], {%f2189, %f2189, %f2189, %f2189}; mov.u32 %r2132, 4; st.local.u32 [%rd7+16], %r2132; st.local.u32 [%rd7+52], %r2132; st.local.u32 [%rd7+88], %r2132; st.local.u32 [%rd7+124], %r2132; $L__BB1_317: mov.u64 %rd11370, 1; add.s64 %rd6112, %rd757, -1; cvt.u32.u64 %r2133, %rd6112; shl.b64 %rd6114, %rd11370, %r2133; and.b64 %rd6115, %rd6114, %rd753; setp.eq.s64 %p516, %rd6115, 0; @%p516 bra $L__BB1_370; shl.b64 %rd6116, %rd757, 3; add.s64 %rd6117, %rd8, %rd6116; ld.local.u64 %rd758, [%rd6117+-8]; setp.eq.s64 %p517, %rd758, 0; @%p517 bra $L__BB1_370; ld.u32 %r220, [%rd758]; cvt.u64.u32 %rd759, %r220; ld.global.u64 %rd6118, [%rd608+-160]; setp.gt.u64 %p518, %rd6118, %rd759; @%p518 bra $L__BB1_321; bra.uni $L__BB1_320; $L__BB1_321: ld.global.u64 %rd6119, [%rd608+-168]; mul.lo.s64 %rd6120, %rd759, 12; add.s64 %rd760, %rd6119, %rd6120; ld.u32 %rd761, [%rd760+8]; ld.u32 %rd762, [%rd760]; ld.global.u64 %rd763, [%rd608+-176]; setp.gt.u64 %p519, %rd763, %rd762; @%p519 bra $L__BB1_323; bra.uni $L__BB1_322; $L__BB1_323: ld.global.u64 %rd764, [%rd608+-184]; shl.b64 %rd6121, %rd762, 3; add.s64 %rd6122, %rd764, %rd6121; ld.u32 %rd6123, [%rd6122]; ld.u32 %rd6124, [%rd6122+4]; bfi.b64 %rd765, %rd6124, %rd6123, 32, 32; ld.u32 %rd766, [%rd760+4]; setp.gt.u64 %p520, %rd763, %rd766; @%p520 bra $L__BB1_325; bra.uni $L__BB1_324; 
$L__BB1_325: setp.gt.u64 %p521, %rd763, %rd761; @%p521 bra $L__BB1_327; bra.uni $L__BB1_326; $L__BB1_327: shl.b64 %rd6125, %rd766, 3; add.s64 %rd6126, %rd764, %rd6125; shl.b64 %rd6127, %rd761, 3; add.s64 %rd6128, %rd764, %rd6127; cvt.u32.u64 %r2134, %rd765; mov.b32 %f224, %r2134; shr.u64 %rd6129, %rd765, 32; cvt.u32.u64 %r2135, %rd6129; mov.b32 %f225, %r2135; ld.u32 %rd6130, [%rd6126]; ld.u32 %rd6131, [%rd6126+4]; bfi.b64 %rd767, %rd6131, %rd6130, 32, 32; cvt.u32.u64 %r2136, %rd767; shr.u64 %rd6132, %rd767, 32; cvt.u32.u64 %r2137, %rd6132; mov.b32 %f226, %r2136; sub.f32 %f227, %f226, %f224; mov.b32 %f5339, %r2137; sub.f32 %f229, %f5339, %f225; ld.u32 %rd6133, [%rd6128]; ld.u32 %rd6134, [%rd6128+4]; bfi.b64 %rd768, %rd6134, %rd6133, 32, 32; cvt.u32.u64 %r2138, %rd768; shr.u64 %rd6135, %rd768, 32; cvt.u32.u64 %r2139, %rd6135; mov.b32 %f230, %r2138; sub.f32 %f231, %f230, %f224; mov.b32 %f232, %r2139; sub.f32 %f233, %f232, %f225; sub.f32 %f234, %f221, %f224; sub.f32 %f235, %f222, %f225; mul.f32 %f2190, %f229, %f235; fma.rn.f32 %f236, %f227, %f234, %f2190; mul.f32 %f2191, %f233, %f235; fma.rn.f32 %f237, %f231, %f234, %f2191; setp.le.f32 %p522, %f236, 0f00000000; setp.le.f32 %p523, %f237, 0f00000000; and.pred %p524, %p522, %p523; @%p524 bra $L__BB1_365; bra.uni $L__BB1_328; $L__BB1_365: add.u64 %rd11707, %SP, 552; cvta.to.local.u64 %rd11705, %rd11707; add.u64 %rd11713, %SP, 0; cvta.to.local.u64 %rd11711, %rd11713; st.local.u64 [%rd11711], %rd765; mov.u64 %rd11718, 2; mov.u64 %rd11704, %rd586; mov.u64 %rd11706, %rd11705; mov.u64 %rd11708, %rd11705; mov.u64 %rd11709, %rd11705; mov.u64 %rd11710, %rd11707; mov.u64 %rd11712, %rd11711; mov.u64 %rd11714, %rd11711; mov.u64 %rd11715, %rd11711; mov.u64 %rd11716, %rd11713; mov.u64 %rd11717, %rd580; $L__BB1_366: setp.eq.s64 %p577, %rd11718, 0; mov.u64 %rd11719, 1; @%p577 bra $L__BB1_368; add.s64 %rd11718, %rd11718, -1; add.s64 %rd6280, %rd11705, 8; setp.eq.s64 %p578, %rd11708, %rd11704; selp.b64 %rd6281, %rd6280, %rd11708, %p578; 
add.s64 %rd6282, %rd11706, 8; selp.b64 %rd6283, %rd6282, %rd11709, %p578; add.s64 %rd6284, %rd11707, 8; selp.b64 %rd6285, %rd6284, %rd11710, %p578; mov.u64 %rd11719, 0; setp.eq.s64 %p579, %rd11718, 0; add.s64 %rd6286, %rd6281, 4; add.s64 %rd6287, %rd6283, 4; add.s64 %rd6288, %rd6285, 4; selp.b64 %rd994, %rd6281, %rd6286, %p579; selp.b64 %rd11709, %rd6283, %rd6287, %p579; selp.b64 %rd11710, %rd6285, %rd6288, %p579; selp.b64 %rd11705, %rd6280, %rd11705, %p578; selp.b64 %rd11706, %rd6282, %rd11706, %p578; selp.b64 %rd11707, %rd6284, %rd11707, %p578; add.s64 %rd6289, %rd11708, 8; selp.b64 %rd11704, %rd6289, %rd11704, %p578; add.s64 %rd6290, %rd11714, 8; setp.eq.s64 %p580, %rd11711, %rd11717; selp.b64 %rd6291, %rd6290, %rd11711, %p580; add.s64 %rd6292, %rd11715, 8; selp.b64 %rd6293, %rd6292, %rd11712, %p580; add.s64 %rd6294, %rd11716, 8; selp.b64 %rd6295, %rd6294, %rd11713, %p580; selp.b64 %rd11714, %rd6290, %rd11714, %p580; selp.b64 %rd11715, %rd6292, %rd11715, %p580; selp.b64 %rd11716, %rd6294, %rd11716, %p580; add.s64 %rd6296, %rd11711, 8; selp.b64 %rd11717, %rd6296, %rd11717, %p580; add.s64 %rd6297, %rd6291, 4; add.s64 %rd6298, %rd6293, 4; add.s64 %rd6299, %rd6295, 4; selp.b64 %rd11711, %rd6291, %rd6297, %p579; selp.b64 %rd11712, %rd6293, %rd6298, %p579; selp.b64 %rd11713, %rd6295, %rd6299, %p579; ld.local.f32 %f2257, [%rd6293]; ld.local.f32 %f2258, [%rd6283]; setp.eq.f32 %p581, %f2258, %f2257; mov.u64 %rd11708, %rd994; @%p581 bra $L__BB1_366; $L__BB1_368: cvt.u32.u64 %r4774, %rd765; mov.u64 %rd11046, 0; or.b64 %rd6301, %rd11046, %rd765; mov.b64 {%r2181, %r2182}, %rd6301; mov.b64 {%r2183, %r2184}, %rd11719; cvt.u32.u64 %r2186, %rd11046; or.b32 %r4869, %r2186, %r4774; mov.u32 %r4870, 0; mov.b32 %f5343, %r2182; mov.b32 {%rs1020, %rs330}, %r2183; mov.u32 %r4871, %r4870; bra.uni $L__BB1_369; $L__BB1_328: cvt.u32.u64 %r4753, %rd767; mov.b32 %f5284, %r4753; sub.f32 %f238, %f221, %f5284; sub.f32 %f239, %f222, %f5339; mul.f32 %f2192, %f229, %f239; fma.rn.f32 %f240, %f227, 
%f238, %f2192; mul.f32 %f2193, %f233, %f239; fma.rn.f32 %f241, %f231, %f238, %f2193; setp.ge.f32 %p525, %f240, 0f00000000; setp.le.f32 %p526, %f241, %f240; and.pred %p527, %p526, %p525; @%p527 bra $L__BB1_361; bra.uni $L__BB1_329; $L__BB1_361: add.u64 %rd11691, %SP, 552; cvta.to.local.u64 %rd11689, %rd11691; add.u64 %rd11697, %SP, 0; cvta.to.local.u64 %rd11695, %rd11697; st.local.u64 [%rd11695], %rd767; mov.u64 %rd11702, 2; mov.u64 %rd11688, %rd586; mov.u64 %rd11690, %rd11689; mov.u64 %rd11692, %rd11689; mov.u64 %rd11693, %rd11689; mov.u64 %rd11694, %rd11691; mov.u64 %rd11696, %rd11695; mov.u64 %rd11698, %rd11695; mov.u64 %rd11699, %rd11695; mov.u64 %rd11700, %rd11697; mov.u64 %rd11701, %rd581; $L__BB1_362: setp.eq.s64 %p572, %rd11702, 0; mov.u64 %rd11703, 1; @%p572 bra $L__BB1_364; add.s64 %rd11702, %rd11702, -1; add.s64 %rd6253, %rd11689, 8; setp.eq.s64 %p573, %rd11692, %rd11688; selp.b64 %rd6254, %rd6253, %rd11692, %p573; add.s64 %rd6255, %rd11690, 8; selp.b64 %rd6256, %rd6255, %rd11693, %p573; add.s64 %rd6257, %rd11691, 8; selp.b64 %rd6258, %rd6257, %rd11694, %p573; mov.u64 %rd11703, 0; setp.eq.s64 %p574, %rd11702, 0; add.s64 %rd6259, %rd6254, 4; add.s64 %rd6260, %rd6256, 4; add.s64 %rd6261, %rd6258, 4; selp.b64 %rd956, %rd6254, %rd6259, %p574; selp.b64 %rd11693, %rd6256, %rd6260, %p574; selp.b64 %rd11694, %rd6258, %rd6261, %p574; selp.b64 %rd11689, %rd6253, %rd11689, %p573; selp.b64 %rd11690, %rd6255, %rd11690, %p573; selp.b64 %rd11691, %rd6257, %rd11691, %p573; add.s64 %rd6262, %rd11692, 8; selp.b64 %rd11688, %rd6262, %rd11688, %p573; add.s64 %rd6263, %rd11698, 8; setp.eq.s64 %p575, %rd11695, %rd11701; selp.b64 %rd6264, %rd6263, %rd11695, %p575; add.s64 %rd6265, %rd11699, 8; selp.b64 %rd6266, %rd6265, %rd11696, %p575; add.s64 %rd6267, %rd11700, 8; selp.b64 %rd6268, %rd6267, %rd11697, %p575; selp.b64 %rd11698, %rd6263, %rd11698, %p575; selp.b64 %rd11699, %rd6265, %rd11699, %p575; selp.b64 %rd11700, %rd6267, %rd11700, %p575; add.s64 %rd6269, %rd11695, 8; 
selp.b64 %rd11701, %rd6269, %rd11701, %p575; add.s64 %rd6270, %rd6264, 4; add.s64 %rd6271, %rd6266, 4; add.s64 %rd6272, %rd6268, 4; selp.b64 %rd11695, %rd6264, %rd6270, %p574; selp.b64 %rd11696, %rd6266, %rd6271, %p574; selp.b64 %rd11697, %rd6268, %rd6272, %p574; ld.local.f32 %f2255, [%rd6266]; ld.local.f32 %f2256, [%rd6256]; setp.eq.f32 %p576, %f2256, %f2255; mov.u64 %rd11692, %rd956; @%p576 bra $L__BB1_362; $L__BB1_364: cvt.u32.u64 %r4773, %rd767; mov.u64 %rd11045, 0; or.b64 %rd6274, %rd11045, %rd767; mov.b64 {%r2173, %r2174}, %rd6274; mov.b64 {%r2175, %r2176}, %rd11703; cvt.u32.u64 %r2178, %rd11045; or.b32 %r4869, %r2178, %r4773; mov.u32 %r4870, 0; mov.b32 %f5343, %r2174; mov.u32 %r4871, 1; mov.b32 {%rs1020, %rs326}, %r2175; bra.uni $L__BB1_369; $L__BB1_329: shr.u64 %rd11348, %rd768, 32; cvt.u32.u64 %r4755, %rd11348; mov.b32 %f5286, %r4755; cvt.u32.u64 %r4754, %rd768; mov.b32 %f5285, %r4754; sub.f32 %f242, %f221, %f5285; sub.f32 %f243, %f222, %f5286; mul.f32 %f2194, %f229, %f243; fma.rn.f32 %f244, %f227, %f242, %f2194; mul.f32 %f2195, %f233, %f243; fma.rn.f32 %f245, %f231, %f242, %f2195; setp.ge.f32 %p528, %f245, 0f00000000; setp.le.f32 %p529, %f244, %f245; and.pred %p530, %p529, %p528; @%p530 bra $L__BB1_357; bra.uni $L__BB1_330; $L__BB1_357: add.u64 %rd11675, %SP, 552; cvta.to.local.u64 %rd11673, %rd11675; add.u64 %rd11681, %SP, 0; cvta.to.local.u64 %rd11679, %rd11681; st.local.u64 [%rd11679], %rd768; mov.u64 %rd11686, 2; mov.u64 %rd11672, %rd586; mov.u64 %rd11674, %rd11673; mov.u64 %rd11676, %rd11673; mov.u64 %rd11677, %rd11673; mov.u64 %rd11678, %rd11675; mov.u64 %rd11680, %rd11679; mov.u64 %rd11682, %rd11679; mov.u64 %rd11683, %rd11679; mov.u64 %rd11684, %rd11681; mov.u64 %rd11685, %rd582; $L__BB1_358: setp.eq.s64 %p567, %rd11686, 0; mov.u64 %rd11687, 1; @%p567 bra $L__BB1_360; add.s64 %rd11686, %rd11686, -1; add.s64 %rd6226, %rd11673, 8; setp.eq.s64 %p568, %rd11676, %rd11672; selp.b64 %rd6227, %rd6226, %rd11676, %p568; add.s64 %rd6228, %rd11674, 8; 
selp.b64 %rd6229, %rd6228, %rd11677, %p568; add.s64 %rd6230, %rd11675, 8; selp.b64 %rd6231, %rd6230, %rd11678, %p568; mov.u64 %rd11687, 0; setp.eq.s64 %p569, %rd11686, 0; add.s64 %rd6232, %rd6227, 4; add.s64 %rd6233, %rd6229, 4; add.s64 %rd6234, %rd6231, 4; selp.b64 %rd918, %rd6227, %rd6232, %p569; selp.b64 %rd11677, %rd6229, %rd6233, %p569; selp.b64 %rd11678, %rd6231, %rd6234, %p569; selp.b64 %rd11673, %rd6226, %rd11673, %p568; selp.b64 %rd11674, %rd6228, %rd11674, %p568; selp.b64 %rd11675, %rd6230, %rd11675, %p568; add.s64 %rd6235, %rd11676, 8; selp.b64 %rd11672, %rd6235, %rd11672, %p568; add.s64 %rd6236, %rd11682, 8; setp.eq.s64 %p570, %rd11679, %rd11685; selp.b64 %rd6237, %rd6236, %rd11679, %p570; add.s64 %rd6238, %rd11683, 8; selp.b64 %rd6239, %rd6238, %rd11680, %p570; add.s64 %rd6240, %rd11684, 8; selp.b64 %rd6241, %rd6240, %rd11681, %p570; selp.b64 %rd11682, %rd6236, %rd11682, %p570; selp.b64 %rd11683, %rd6238, %rd11683, %p570; selp.b64 %rd11684, %rd6240, %rd11684, %p570; add.s64 %rd6242, %rd11679, 8; selp.b64 %rd11685, %rd6242, %rd11685, %p570; add.s64 %rd6243, %rd6237, 4; add.s64 %rd6244, %rd6239, 4; add.s64 %rd6245, %rd6241, 4; selp.b64 %rd11679, %rd6237, %rd6243, %p569; selp.b64 %rd11680, %rd6239, %rd6244, %p569; selp.b64 %rd11681, %rd6241, %rd6245, %p569; ld.local.f32 %f2253, [%rd6239]; ld.local.f32 %f2254, [%rd6229]; setp.eq.f32 %p571, %f2254, %f2253; mov.u64 %rd11676, %rd918; @%p571 bra $L__BB1_358; $L__BB1_360: cvt.u32.u64 %r4772, %rd768; mov.u64 %rd11044, 0; or.b64 %rd6247, %rd11044, %rd768; mov.b64 {%r2165, %r2166}, %rd6247; mov.b64 {%r2167, %r2168}, %rd11687; cvt.u32.u64 %r2170, %rd11044; or.b32 %r4869, %r2170, %r4772; mov.u32 %r4870, 0; mov.b32 %f5343, %r2166; mov.b32 {%rs1020, %rs322}, %r2167; mov.u32 %r4871, 2; bra.uni $L__BB1_369; $L__BB1_330: cvt.u32.u64 %r4760, %rd765; mov.b32 %f5293, %r4760; sub.f32 %f5292, %f221, %f5293; shr.u64 %rd11350, %rd765, 32; cvt.u32.u64 %r4759, %rd11350; mov.b32 %f5291, %r4759; sub.f32 %f5290, %f222, %f5291; 
shr.u64 %rd11349, %rd768, 32; cvt.u32.u64 %r4758, %rd11349; mov.b32 %f5289, %r4758; cvt.u32.u64 %r4757, %rd768; mov.b32 %f5288, %r4757; cvt.u32.u64 %r4756, %rd767; mov.b32 %f5287, %r4756; sub.f32 %f246, %f5288, %f5287; sub.f32 %f247, %f5289, %f5339; mul.f32 %f2196, %f229, %f231; mul.f32 %f2197, %f227, %f233; sub.f32 %f248, %f2197, %f2196; mul.f32 %f2198, %f229, %f5292; mul.f32 %f2199, %f227, %f5290; sub.f32 %f2200, %f2199, %f2198; mul.f32 %f2201, %f248, %f2200; setp.lt.f32 %p531, %f2201, 0f00000000; setp.ge.f32 %p532, %f236, 0f00000000; and.pred %p533, %p532, %p531; setp.le.f32 %p534, %f240, 0f00000000; and.pred %p535, %p534, %p533; mov.u16 %rs1019, 0; @%p535 bra $L__BB1_333; cvt.u32.u64 %r4776, %rd768; mov.b32 %f5311, %r4776; sub.f32 %f5310, %f221, %f5311; shr.u64 %rd11372, %rd768, 32; cvt.u32.u64 %r4775, %rd11372; mov.b32 %f5309, %r4775; sub.f32 %f5308, %f222, %f5309; mul.f32 %f2202, %f231, %f5308; mul.f32 %f2203, %f5310, %f233; sub.f32 %f2204, %f2202, %f2203; mul.f32 %f2205, %f248, %f2204; setp.gt.f32 %p536, %f2205, 0f80000000; setp.ge.f32 %p537, %f237, 0f00000000; and.pred %p538, %p537, %p536; setp.le.f32 %p539, %f245, 0f00000000; and.pred %p540, %p539, %p538; mov.u16 %rs1019, 1; @%p540 bra $L__BB1_333; mul.f32 %f2206, %f246, %f239; mul.f32 %f2207, %f238, %f247; sub.f32 %f2208, %f2206, %f2207; mul.f32 %f2209, %f248, %f2208; setp.lt.f32 %p541, %f2209, 0f00000000; sub.f32 %f2210, %f241, %f240; setp.ge.f32 %p542, %f2210, 0f00000000; and.pred %p543, %p542, %p541; sub.f32 %f2211, %f244, %f245; setp.ge.f32 %p544, %f2211, 0f00000000; and.pred %p545, %p544, %p543; selp.b16 %rs1019, 2, 3, %p545; $L__BB1_333: mul.f32 %f2212, %f229, %f229; fma.rn.f32 %f2213, %f227, %f227, %f2212; add.f32 %f249, %f2213, 0f00000000; mul.f32 %f2214, %f233, %f233; fma.rn.f32 %f2215, %f231, %f231, %f2214; add.f32 %f250, %f2215, 0f00000000; mul.f32 %f2216, %f247, %f247; fma.rn.f32 %f2217, %f246, %f246, %f2216; add.f32 %f251, %f2217, 0f00000000; setp.eq.s16 %p546, %rs1019, 1; @%p546 bra 
$L__BB1_348; setp.eq.s16 %p547, %rs1019, 2; @%p547 bra $L__BB1_344; setp.ne.s16 %p548, %rs1019, 3; @%p548 bra $L__BB1_352; cvt.u32.u64 %r4765, %rd765; mov.b32 %f5301, %r4765; sub.f32 %f5300, %f221, %f5301; shr.u64 %rd11371, %rd765, 32; cvt.u32.u64 %r4764, %rd11371; mov.b32 %f5299, %r4764; sub.f32 %f5298, %f222, %f5299; sub.f32 %f2218, %f236, %f240; div.rn.f32 %f252, %f236, %f2218; sub.f32 %f2219, %f237, %f245; div.rn.f32 %f253, %f237, %f2219; sub.f32 %f2220, %f241, %f240; add.f32 %f2221, %f244, %f2220; sub.f32 %f2222, %f2221, %f245; div.rn.f32 %f5341, %f2220, %f2222; mul.f32 %f2223, %f5298, %f5298; fma.rn.f32 %f2224, %f5300, %f5300, %f2223; add.f32 %f2225, %f2224, 0f00000000; mul.f32 %f2226, %f249, %f252; mul.f32 %f2227, %f252, %f2226; sub.f32 %f255, %f2225, %f2227; mul.f32 %f2228, %f250, %f5341; mul.f32 %f2229, %f5341, %f2228; sub.f32 %f256, %f2225, %f2229; mul.f32 %f2230, %f239, %f239; fma.rn.f32 %f2231, %f238, %f238, %f2230; add.f32 %f2232, %f2231, 0f00000000; mul.f32 %f2233, %f251, %f253; mul.f32 %f2234, %f253, %f2233; sub.f32 %f257, %f2232, %f2234; setp.lt.f32 %p549, %f255, %f256; @%p549 bra $L__BB1_340; bra.uni $L__BB1_337; $L__BB1_340: setp.lt.f32 %p551, %f255, %f257; @%p551 bra $L__BB1_342; bra.uni $L__BB1_341; $L__BB1_342: cvt.u32.u64 %r4769, %rd765; mov.b32 %f5305, %r4769; mul.f32 %f5340, %f229, %f252; fma.rn.f32 %f5338, %f227, %f252, %f5305; mov.u32 %r4871, 0; mov.f32 %f5339, %f225; mov.f32 %f5341, %f252; bra.uni $L__BB1_343; $L__BB1_344: cvt.u32.u64 %r4770, %rd767; mov.b32 %f5306, %r4770; add.u64 %rd11625, %SP, 552; cvta.to.local.u64 %rd11623, %rd11625; add.u64 %rd11631, %SP, 0; cvta.to.local.u64 %rd11629, %rd11631; mul.f32 %f2237, %f247, %f239; fma.rn.f32 %f2238, %f246, %f238, %f2237; div.rn.f32 %f5342, %f2238, %f251; fma.rn.f32 %f2239, %f246, %f5342, %f5306; mov.b32 %r2147, %f2239; fma.rn.f32 %f2240, %f247, %f5342, %f5339; mov.b32 %r2148, %f2240; cvt.u64.u32 %rd6139, %r2148; cvt.u64.u32 %rd6140, %r2147; bfi.b64 %rd776, %rd6139, %rd6140, 32, 32; 
st.local.u64 [%rd11629], %rd776; mov.u64 %rd11636, 2; mov.u64 %rd11622, %rd586; mov.u64 %rd11624, %rd11623; mov.u64 %rd11626, %rd11623; mov.u64 %rd11627, %rd11623; mov.u64 %rd11628, %rd11625; mov.u64 %rd11630, %rd11629; mov.u64 %rd11632, %rd11629; mov.u64 %rd11633, %rd11629; mov.u64 %rd11634, %rd11631; mov.u64 %rd11635, %rd585; $L__BB1_345: setp.eq.s64 %p552, %rd11636, 0; mov.u64 %rd11671, 1; @%p552 bra $L__BB1_347; add.s64 %rd11636, %rd11636, -1; add.s64 %rd6145, %rd11623, 8; setp.eq.s64 %p553, %rd11626, %rd11622; selp.b64 %rd6146, %rd6145, %rd11626, %p553; add.s64 %rd6147, %rd11624, 8; selp.b64 %rd6148, %rd6147, %rd11627, %p553; add.s64 %rd6149, %rd11625, 8; selp.b64 %rd6150, %rd6149, %rd11628, %p553; mov.u64 %rd11671, 0; setp.eq.s64 %p554, %rd11636, 0; add.s64 %rd6151, %rd6146, 4; add.s64 %rd6152, %rd6148, 4; add.s64 %rd6153, %rd6150, 4; selp.b64 %rd793, %rd6146, %rd6151, %p554; selp.b64 %rd11627, %rd6148, %rd6152, %p554; selp.b64 %rd11628, %rd6150, %rd6153, %p554; selp.b64 %rd11623, %rd6145, %rd11623, %p553; selp.b64 %rd11624, %rd6147, %rd11624, %p553; selp.b64 %rd11625, %rd6149, %rd11625, %p553; add.s64 %rd6154, %rd11626, 8; selp.b64 %rd11622, %rd6154, %rd11622, %p553; add.s64 %rd6155, %rd11632, 8; setp.eq.s64 %p555, %rd11629, %rd11635; selp.b64 %rd6156, %rd6155, %rd11629, %p555; add.s64 %rd6157, %rd11633, 8; selp.b64 %rd6158, %rd6157, %rd11630, %p555; add.s64 %rd6159, %rd11634, 8; selp.b64 %rd6160, %rd6159, %rd11631, %p555; selp.b64 %rd11632, %rd6155, %rd11632, %p555; selp.b64 %rd11633, %rd6157, %rd11633, %p555; selp.b64 %rd11634, %rd6159, %rd11634, %p555; add.s64 %rd6161, %rd11629, 8; selp.b64 %rd11635, %rd6161, %rd11635, %p555; add.s64 %rd6162, %rd6156, 4; add.s64 %rd6163, %rd6158, 4; add.s64 %rd6164, %rd6160, 4; selp.b64 %rd11629, %rd6156, %rd6162, %p554; selp.b64 %rd11630, %rd6158, %rd6163, %p554; selp.b64 %rd11631, %rd6160, %rd6164, %p554; ld.local.f32 %f2241, [%rd6158]; ld.local.f32 %f2242, [%rd6148]; setp.eq.f32 %p556, %f2242, %f2241; mov.u64 %rd11626, 
%rd793; @%p556 bra $L__BB1_345; $L__BB1_347: mov.u64 %rd11041, 0; or.b64 %rd11670, %rd11041, %rd776; mov.u32 %r4871, 1; bra.uni $L__BB1_356; $L__BB1_348: cvt.u32.u64 %r4771, %rd765; mov.b32 %f5307, %r4771; add.u64 %rd11641, %SP, 552; cvta.to.local.u64 %rd11639, %rd11641; add.u64 %rd11647, %SP, 0; cvta.to.local.u64 %rd11645, %rd11647; div.rn.f32 %f5342, %f237, %f250; fma.rn.f32 %f2243, %f231, %f5342, %f5307; mov.b32 %r2150, %f2243; fma.rn.f32 %f2244, %f233, %f5342, %f225; mov.b32 %r2151, %f2244; cvt.u64.u32 %rd6166, %r2151; cvt.u64.u32 %rd6167, %r2150; bfi.b64 %rd817, %rd6166, %rd6167, 32, 32; st.local.u64 [%rd11645], %rd817; mov.u64 %rd11652, 2; mov.u64 %rd11638, %rd586; mov.u64 %rd11640, %rd11639; mov.u64 %rd11642, %rd11639; mov.u64 %rd11643, %rd11639; mov.u64 %rd11644, %rd11641; mov.u64 %rd11646, %rd11645; mov.u64 %rd11648, %rd11645; mov.u64 %rd11649, %rd11645; mov.u64 %rd11650, %rd11647; mov.u64 %rd11651, %rd584; $L__BB1_349: setp.eq.s64 %p557, %rd11652, 0; mov.u64 %rd11671, 1; @%p557 bra $L__BB1_351; add.s64 %rd11652, %rd11652, -1; add.s64 %rd6172, %rd11639, 8; setp.eq.s64 %p558, %rd11642, %rd11638; selp.b64 %rd6173, %rd6172, %rd11642, %p558; add.s64 %rd6174, %rd11640, 8; selp.b64 %rd6175, %rd6174, %rd11643, %p558; add.s64 %rd6176, %rd11641, 8; selp.b64 %rd6177, %rd6176, %rd11644, %p558; mov.u64 %rd11671, 0; setp.eq.s64 %p559, %rd11652, 0; add.s64 %rd6178, %rd6173, 4; add.s64 %rd6179, %rd6175, 4; add.s64 %rd6180, %rd6177, 4; selp.b64 %rd834, %rd6173, %rd6178, %p559; selp.b64 %rd11643, %rd6175, %rd6179, %p559; selp.b64 %rd11644, %rd6177, %rd6180, %p559; selp.b64 %rd11639, %rd6172, %rd11639, %p558; selp.b64 %rd11640, %rd6174, %rd11640, %p558; selp.b64 %rd11641, %rd6176, %rd11641, %p558; add.s64 %rd6181, %rd11642, 8; selp.b64 %rd11638, %rd6181, %rd11638, %p558; add.s64 %rd6182, %rd11648, 8; setp.eq.s64 %p560, %rd11645, %rd11651; selp.b64 %rd6183, %rd6182, %rd11645, %p560; add.s64 %rd6184, %rd11649, 8; selp.b64 %rd6185, %rd6184, %rd11646, %p560; add.s64 %rd6186, 
%rd11650, 8; selp.b64 %rd6187, %rd6186, %rd11647, %p560; selp.b64 %rd11648, %rd6182, %rd11648, %p560; selp.b64 %rd11649, %rd6184, %rd11649, %p560; selp.b64 %rd11650, %rd6186, %rd11650, %p560; add.s64 %rd6188, %rd11645, 8; selp.b64 %rd11651, %rd6188, %rd11651, %p560; add.s64 %rd6189, %rd6183, 4; add.s64 %rd6190, %rd6185, 4; add.s64 %rd6191, %rd6187, 4; selp.b64 %rd11645, %rd6183, %rd6189, %p559; selp.b64 %rd11646, %rd6185, %rd6190, %p559; selp.b64 %rd11647, %rd6187, %rd6191, %p559; ld.local.f32 %f2245, [%rd6185]; ld.local.f32 %f2246, [%rd6175]; setp.eq.f32 %p561, %f2246, %f2245; mov.u64 %rd11642, %rd834; @%p561 bra $L__BB1_349; $L__BB1_351: mov.u64 %rd11042, 0; or.b64 %rd11670, %rd11042, %rd817; mov.u32 %r4871, 2; bra.uni $L__BB1_356; $L__BB1_352: cvt.u32.u64 %r4761, %rd765; mov.b32 %f5294, %r4761; div.rn.f32 %f5342, %f236, %f249; fma.rn.f32 %f2247, %f227, %f5342, %f5294; mov.b32 %r2153, %f2247; fma.rn.f32 %f2248, %f229, %f5342, %f225; mov.b32 %r2154, %f2248; cvt.u64.u32 %rd6193, %r2154; cvt.u64.u32 %rd6194, %r2153; bfi.b64 %rd858, %rd6193, %rd6194, 32, 32; st.local.u64 [%rd5888], %rd858; mov.u64 %rd11668, 2; mov.u64 %rd11654, %rd586; mov.u64 %rd11655, %rd5900; mov.u64 %rd11656, %rd5900; mov.u64 %rd11657, %rd5899; mov.u64 %rd11658, %rd5900; mov.u64 %rd11659, %rd5900; mov.u64 %rd11660, %rd5899; mov.u64 %rd11661, %rd5888; mov.u64 %rd11662, %rd5888; mov.u64 %rd11663, %rd5887; mov.u64 %rd11664, %rd5888; mov.u64 %rd11665, %rd5888; mov.u64 %rd11666, %rd5887; mov.u64 %rd11667, %rd583; $L__BB1_353: setp.eq.s64 %p562, %rd11668, 0; mov.u64 %rd11671, 1; @%p562 bra $L__BB1_355; add.s64 %rd11668, %rd11668, -1; add.s64 %rd6199, %rd11655, 8; setp.eq.s64 %p563, %rd11658, %rd11654; selp.b64 %rd6200, %rd6199, %rd11658, %p563; add.s64 %rd6201, %rd11656, 8; selp.b64 %rd6202, %rd6201, %rd11659, %p563; add.s64 %rd6203, %rd11657, 8; selp.b64 %rd6204, %rd6203, %rd11660, %p563; mov.u64 %rd11671, 0; setp.eq.s64 %p564, %rd11668, 0; add.s64 %rd6205, %rd6200, 4; add.s64 %rd6206, %rd6202, 4; 
add.s64 %rd6207, %rd6204, 4; selp.b64 %rd875, %rd6200, %rd6205, %p564; selp.b64 %rd11659, %rd6202, %rd6206, %p564; selp.b64 %rd11660, %rd6204, %rd6207, %p564; selp.b64 %rd11655, %rd6199, %rd11655, %p563; selp.b64 %rd11656, %rd6201, %rd11656, %p563; selp.b64 %rd11657, %rd6203, %rd11657, %p563; add.s64 %rd6208, %rd11658, 8; selp.b64 %rd11654, %rd6208, %rd11654, %p563; add.s64 %rd6209, %rd11664, 8; setp.eq.s64 %p565, %rd11661, %rd11667; selp.b64 %rd6210, %rd6209, %rd11661, %p565; add.s64 %rd6211, %rd11665, 8; selp.b64 %rd6212, %rd6211, %rd11662, %p565; add.s64 %rd6213, %rd11666, 8; selp.b64 %rd6214, %rd6213, %rd11663, %p565; selp.b64 %rd11664, %rd6209, %rd11664, %p565; selp.b64 %rd11665, %rd6211, %rd11665, %p565; selp.b64 %rd11666, %rd6213, %rd11666, %p565; add.s64 %rd6215, %rd11661, 8; selp.b64 %rd11667, %rd6215, %rd11667, %p565; add.s64 %rd6216, %rd6210, 4; add.s64 %rd6217, %rd6212, 4; add.s64 %rd6218, %rd6214, 4; selp.b64 %rd11661, %rd6210, %rd6216, %p564; selp.b64 %rd11662, %rd6212, %rd6217, %p564; selp.b64 %rd11663, %rd6214, %rd6218, %p564; ld.local.f32 %f2249, [%rd6212]; ld.local.f32 %f2250, [%rd6202]; setp.eq.f32 %p566, %f2250, %f2249; mov.u64 %rd11658, %rd875; @%p566 bra $L__BB1_353; $L__BB1_355: mov.u64 %rd11043, 0; or.b64 %rd11670, %rd11043, %rd858; mov.u32 %r4871, 0; $L__BB1_356: mov.f32 %f2251, 0f3F800000; sub.f32 %f2252, %f2251, %f5342; mov.b32 %r2157, %f2252; mov.b32 %r2158, %f5342; cvt.u64.u32 %rd6219, %r2158; cvt.u64.u32 %rd6220, %r2157; bfi.b64 %rd11720, %rd6219, %rd6220, 32, 32; mov.b64 {%r2159, %r2160}, %rd11671; mov.b64 {%r2161, %r2162}, %rd11670; cvt.u32.u64 %r4869, %rd11670; mov.b32 %f5343, %r2162; mov.u32 %r4870, 1; mov.b32 {%rs1020, %rs318}, %r2159; bra.uni $L__BB1_369; $L__BB1_337: setp.lt.f32 %p550, %f256, %f257; @%p550 bra $L__BB1_339; bra.uni $L__BB1_338; $L__BB1_339: cvt.u32.u64 %r4767, %rd765; mov.b32 %f5303, %r4767; mul.f32 %f5340, %f233, %f253; fma.rn.f32 %f5338, %f231, %f253, %f5303; mov.u32 %r4871, 2; mov.f32 %f5339, %f225; mov.f32 
%f5341, %f253; bra.uni $L__BB1_343; $L__BB1_341: cvt.u32.u64 %r4768, %rd767; mov.b32 %f5304, %r4768; mul.f32 %f5340, %f247, %f5341; fma.rn.f32 %f5338, %f246, %f5341, %f5304; mov.u32 %r4871, 1; bra.uni $L__BB1_343; $L__BB1_338: cvt.u32.u64 %r4766, %rd767; mov.b32 %f5302, %r4766; mul.f32 %f5340, %f247, %f5341; fma.rn.f32 %f5338, %f246, %f5341, %f5302; mov.u32 %r4871, 1; $L__BB1_343: add.f32 %f5343, %f5339, %f5340; mov.f32 %f2235, 0f3F800000; sub.f32 %f2236, %f2235, %f5341; mov.b32 %r2145, %f2236; mov.b32 %r2146, %f5341; cvt.u64.u32 %rd6136, %r2146; cvt.u64.u32 %rd6137, %r2145; bfi.b64 %rd11720, %rd6136, %rd6137, 32, 32; mov.b32 %r4869, %f5338; mov.u32 %r4870, 1; mov.u16 %rs1020, 1; $L__BB1_369: mov.b32 %f2259, %r4869; sub.f32 %f2260, %f2259, %f221; sub.f32 %f2261, %f5343, %f222; mul.f32 %f2262, %f2261, %f2261; fma.rn.f32 %f2263, %f2260, %f2260, %f2262; add.f32 %f2264, %f2263, 0f00000000; sqrt.rn.f32 %f2265, %f2264; shl.b64 %rd6304, %rd757, 2; add.s64 %rd6305, %rd24, %rd6304; st.local.f32 [%rd6305+-4], %f2265; mul.lo.s64 %rd6306, %rd757, 36; add.s64 %rd6307, %rd7, %rd6306; st.local.u32 [%rd6307+-36], %r4869; st.local.f32 [%rd6307+-32], %f5343; mov.u16 %rs331, 0; st.local.v4.u8 [%rd6307+-28], {%rs1020, %rs331, %rs331, %rs331}; st.local.u32 [%rd6307+-24], %r220; st.local.u32 [%rd6307+-20], %r4870; st.local.u32 [%rd6307+-16], %r4871; shr.u64 %rd6308, %rd11720, 32; st.local.u32 [%rd6307+-8], %rd6308; st.local.u32 [%rd6307+-12], %rd11720; $L__BB1_370: setp.lt.u64 %p582, %rd757, 4; add.s64 %rd757, %rd757, 1; @%p582 bra $L__BB1_317; ld.local.v2.u64 {%rd11721, %rd11722}, [%rd24]; ld.local.v4.u32 {%r4881, %r4882, %r4883, %r2190}, [%rd7]; ld.local.u32 %r4884, [%rd7+16]; ld.local.u32 %rd6311, [%rd597+4]; ld.local.u32 %rd6312, [%rd597+8]; bfi.b64 %rd6313, %rd6312, %rd6311, 32, 32; mov.b64 {%r4878, %r4879}, %rd6313; ld.local.u32 %r4880, [%rd597+12]; ld.local.u32 %r4885, [%rd598+4]; ld.local.u32 %r4877, [%rd599+16]; ld.local.u64 %rd6314, [%rd599+8]; mov.b64 {%r4875, %r4876}, 
%rd6314; ld.local.u32 %r4886, [%rd600+8]; ld.local.u32 %rd6315, [%rd601+12]; ld.local.u32 %rd6316, [%rd601+16]; bfi.b64 %rd6317, %rd6316, %rd6315, 32, 32; mov.b64 {%r4872, %r4873}, %rd6317; ld.local.u32 %r4874, [%rd601+20]; ld.local.u32 %r4887, [%rd602+12]; bra.uni $L__BB1_372; $L__BB1_315: mov.u32 %r4884, 4; mov.u32 %r4885, %r4884; mov.u32 %r4886, %r4884; mov.u32 %r4887, %r4884; $L__BB1_372: and.b64 %rd6318, %rd753, 1; setp.eq.b64 %p583, %rd6318, 1; mov.pred %p584, 0; xor.pred %p585, %p583, %p584; not.pred %p586, %p585; mov.b64 {%r263, %r264}, %rd11721; mov.b32 %f280, %r263; mov.b32 %f281, %r264; mov.b64 {%r265, %r266}, %rd11722; mov.b32 %f282, %r265; mov.b32 %f283, %r266; @%p586 bra $L__BB1_381; bra.uni $L__BB1_373; $L__BB1_381: and.b64 %rd6336, %rd753, 2; setp.eq.s64 %p600, %rd6336, 0; @%p600 bra $L__BB1_390; bra.uni $L__BB1_382; $L__BB1_390: and.b64 %rd6354, %rd753, 4; setp.eq.s64 %p614, %rd6354, 0; @%p614 bra $L__BB1_399; bra.uni $L__BB1_391; $L__BB1_399: and.b64 %rd6372, %rd753, 8; setp.eq.s64 %p628, %rd6372, 0; @%p628 bra $L__BB1_308; ld.u8 %rs338, [%rd742+88]; and.b16 %rs339, %rs338, 1; setp.eq.b16 %p629, %rs339, 1; mov.pred %p630, 0; xor.pred %p631, %p629, %p630; not.pred %p632, %p631; @%p632 bra $L__BB1_403; bra.uni $L__BB1_401; $L__BB1_403: ld.u32 %r314, [%rd742+76]; cvt.u64.u32 %rd6376, %r314; setp.le.u64 %p639, %rd730, %rd6376; @%p639 bra $L__BB1_308; neg.f32 %f287, %f283; setp.lt.u32 %p640, %r219, 64; @%p640 bra $L__BB1_406; bra.uni $L__BB1_405; $L__BB1_406: cvta.to.local.u64 %rd11354, %rd5901; mul.wide.u32 %rd6388, %r219, 8; add.s64 %rd6389, %rd11354, %rd6388; mov.u64 %rd11729, 0; st.local.u32 [%rd6389], %r314; st.local.f32 [%rd6389+4], %f287; add.s32 %r219, %r219, 1; st.local.u32 [%rd11354+512], %r219; mov.u64 %rd11730, %rd11729; bra.uni $L__BB1_407; $L__BB1_373: ld.u8 %rs332, [%rd742+88]; and.b16 %rs333, %rs332, 1; setp.eq.b16 %p587, %rs333, 1; xor.pred %p589, %p587, %p584; not.pred %p590, %p589; @%p590 bra $L__BB1_376; bra.uni $L__BB1_374; 
$L__BB1_376: ld.u32 %r272, [%rd742+64]; cvt.u64.u32 %rd6322, %r272; setp.le.u64 %p597, %rd730, %rd6322; @%p597 bra $L__BB1_381; neg.f32 %f284, %f280; setp.lt.u32 %p598, %r219, 64; @%p598 bra $L__BB1_379; bra.uni $L__BB1_378; $L__BB1_379: cvta.to.local.u64 %rd11351, %rd5901; add.s32 %r2193, %r218, -1; mul.wide.u32 %rd6334, %r2193, 8; add.s64 %rd6335, %rd11351, %rd6334; mov.u64 %rd11723, 0; st.local.u32 [%rd6335], %r272; st.local.f32 [%rd6335+4], %f284; add.s32 %r219, %r219, 1; st.local.u32 [%rd11351+512], %r219; mov.u64 %rd11724, %rd11723; bra.uni $L__BB1_380; $L__BB1_382: ld.u8 %rs334, [%rd742+88]; and.b16 %rs335, %rs334, 1; setp.eq.b16 %p601, %rs335, 1; mov.pred %p602, 0; xor.pred %p603, %p601, %p602; not.pred %p604, %p603; @%p604 bra $L__BB1_385; bra.uni $L__BB1_383; $L__BB1_385: ld.u32 %r286, [%rd742+68]; cvt.u64.u32 %rd6340, %r286; setp.le.u64 %p611, %rd730, %rd6340; @%p611 bra $L__BB1_390; neg.f32 %f285, %f281; setp.lt.u32 %p612, %r219, 64; @%p612 bra $L__BB1_388; bra.uni $L__BB1_387; $L__BB1_388: cvta.to.local.u64 %rd11352, %rd5901; mul.wide.u32 %rd6352, %r219, 8; add.s64 %rd6353, %rd11352, %rd6352; mov.u64 %rd11725, 0; st.local.u32 [%rd6353], %r286; st.local.f32 [%rd6353+4], %f285; add.s32 %r219, %r219, 1; st.local.u32 [%rd11352+512], %r219; mov.u64 %rd11726, %rd11725; bra.uni $L__BB1_389; $L__BB1_391: ld.u8 %rs336, [%rd742+88]; and.b16 %rs337, %rs336, 1; setp.eq.b16 %p615, %rs337, 1; mov.pred %p616, 0; xor.pred %p617, %p615, %p616; not.pred %p618, %p617; @%p618 bra $L__BB1_394; bra.uni $L__BB1_392; $L__BB1_394: ld.u32 %r300, [%rd742+72]; cvt.u64.u32 %rd6358, %r300; setp.le.u64 %p625, %rd730, %rd6358; @%p625 bra $L__BB1_399; neg.f32 %f286, %f282; setp.lt.u32 %p626, %r219, 64; @%p626 bra $L__BB1_397; bra.uni $L__BB1_396; $L__BB1_397: cvta.to.local.u64 %rd11353, %rd5901; mul.wide.u32 %rd6370, %r219, 8; add.s64 %rd6371, %rd11353, %rd6370; mov.u64 %rd11727, 0; st.local.u32 [%rd6371], %r300; st.local.f32 [%rd6371+4], %f286; add.s32 %r219, %r219, 1; st.local.u32 
[%rd11353+512], %r219; mov.u64 %rd11728, %rd11727; bra.uni $L__BB1_398; $L__BB1_374: mov.b32 %f5295, %r217; setp.leu.f32 %p591, %f5295, %f280; setp.eq.s32 %p592, %r4884, 4; or.pred %p593, %p592, %p591; @%p593 bra $L__BB1_381; ld.u32 %r2191, [%rd742+64]; cvt.u64.u32 %rd6319, %r2191; setp.le.u64 %p594, %rd733, %rd6319; mul.wide.u32 %rd6320, %r2191, 12; add.s64 %rd6321, %rd734, %rd6320; setp.eq.s64 %p595, %rd6321, 0; or.pred %p596, %p594, %p595; selp.b32 %r214, %r214, %r4883, %p596; selp.b32 %r213, %r213, %r4882, %p596; selp.b32 %r212, %r212, %r4881, %p596; selp.b32 %r216, %r216, %r4884, %p596; selp.b32 %r217, %r217, %r263, %p596; bra.uni $L__BB1_381; $L__BB1_401: mov.b32 %f2268, %r217; setp.leu.f32 %p633, %f2268, %f283; setp.eq.s32 %p634, %r4887, 4; or.pred %p635, %p634, %p633; @%p635 bra $L__BB1_308; bra.uni $L__BB1_402; $L__BB1_383: mov.b32 %f2266, %r217; setp.leu.f32 %p605, %f2266, %f281; setp.eq.s32 %p606, %r4885, 4; or.pred %p607, %p606, %p605; @%p607 bra $L__BB1_390; ld.u32 %r2199, [%rd742+68]; cvt.u64.u32 %rd6337, %r2199; setp.le.u64 %p608, %rd733, %rd6337; mul.wide.u32 %rd6338, %r2199, 12; add.s64 %rd6339, %rd734, %rd6338; setp.eq.s64 %p609, %rd6339, 0; or.pred %p610, %p608, %p609; selp.b32 %r214, %r214, %r4880, %p610; selp.b32 %r213, %r213, %r4879, %p610; selp.b32 %r212, %r212, %r4878, %p610; selp.b32 %r216, %r216, %r4885, %p610; selp.b32 %r217, %r217, %r264, %p610; bra.uni $L__BB1_390; $L__BB1_392: mov.b32 %f2267, %r217; setp.leu.f32 %p619, %f2267, %f282; setp.eq.s32 %p620, %r4886, 4; or.pred %p621, %p620, %p619; @%p621 bra $L__BB1_399; ld.u32 %r2206, [%rd742+72]; cvt.u64.u32 %rd6355, %r2206; setp.le.u64 %p622, %rd733, %rd6355; mul.wide.u32 %rd6356, %r2206, 12; add.s64 %rd6357, %rd734, %rd6356; setp.eq.s64 %p623, %rd6357, 0; or.pred %p624, %p622, %p623; selp.b32 %r214, %r214, %r4877, %p624; selp.b32 %r213, %r213, %r4876, %p624; selp.b32 %r212, %r212, %r4875, %p624; selp.b32 %r216, %r216, %r4886, %p624; selp.b32 %r217, %r217, %r265, %p624; bra.uni 
$L__BB1_399; $L__BB1_405: mov.u64 %rd11730, 1; shl.b64 %rd11729, %rd6376, 32; $L__BB1_407: mov.u64 %rd11056, 0; cvt.u32.u64 %r2215, %rd11056; cvt.u32.u64 %r2216, %rd11729; or.b32 %r2217, %r2216, %r2215; cvt.u32.u64 %r2218, %rd11730; or.b32 %r2219, %r2217, %r2218; setp.eq.s32 %p641, %r2219, 0; @%p641 bra $L__BB1_308; bra.uni $L__BB1_408; $L__BB1_378: mov.u64 %rd11724, 1; shl.b64 %rd11723, %rd6322, 32; $L__BB1_380: mov.u64 %rd11047, 0; cvt.u32.u64 %r2194, %rd11047; cvt.u32.u64 %r2195, %rd11723; or.b32 %r2196, %r2195, %r2194; cvt.u32.u64 %r2197, %rd11724; or.b32 %r2198, %r2196, %r2197; setp.ne.s32 %p599, %r2198, 0; @%p599 bra $L__BB1_408; bra.uni $L__BB1_381; $L__BB1_387: mov.u64 %rd11726, 1; shl.b64 %rd11725, %rd6340, 32; $L__BB1_389: mov.u64 %rd11050, 0; cvt.u32.u64 %r2201, %rd11050; cvt.u32.u64 %r2202, %rd11725; or.b32 %r2203, %r2202, %r2201; cvt.u32.u64 %r2204, %rd11726; or.b32 %r2205, %r2203, %r2204; setp.ne.s32 %p613, %r2205, 0; @%p613 bra $L__BB1_408; bra.uni $L__BB1_390; $L__BB1_396: mov.u64 %rd11728, 1; shl.b64 %rd11727, %rd6358, 32; $L__BB1_398: mov.u64 %rd11053, 0; cvt.u32.u64 %r2208, %rd11053; cvt.u32.u64 %r2209, %rd11727; or.b32 %r2210, %r2209, %r2208; cvt.u32.u64 %r2211, %rd11728; or.b32 %r2212, %r2210, %r2211; setp.ne.s32 %p627, %r2212, 0; @%p627 bra $L__BB1_408; bra.uni $L__BB1_399; $L__BB1_409: mov.u64 %rd11732, 2; mov.u64 %rd11731, 0; setp.eq.s32 %p642, %r216, 4; mov.u64 %rd11733, %rd11731; @%p642 bra $L__BB1_411; mov.b64 %rd11733, {%r212, %r213}; mov.b32 {%rs340, %rs341}, %r214; mov.b64 %rd6396, {%r214, %r2220}; and.b64 %rd11731, %rd6396, 4294967040; cvt.u64.u16 %rd6397, %rs340; and.b64 %rd11732, %rd6397, 255; $L__BB1_411: mov.u64 %rd11734, 2; mov.u64 %rd11735, 0; or.b64 %rd6404, %rd11732, %rd11731; or.b64 %rd6405, %rd6404, %rd11735; mov.b64 {%r2221, %r2222}, %rd6405; mov.b32 {%rs40, %rs342}, %r2221; and.b16 %rs343, %rs40, 255; setp.eq.s16 %p643, %rs343, 2; @%p643 bra $L__BB1_413; cvt.u32.u64 %r2223, %rd11733; mov.b32 %f2269, %r2223; shr.u64 
%rd6406, %rd11733, 32; cvt.u32.u64 %r2224, %rd6406; mov.b32 %f2270, %r2224; ld.global.f32 %f2271, [%rd608+-24]; mul.f32 %f2272, %f2271, %f2269; ld.global.f32 %f2273, [%rd608+-20]; mul.f32 %f2274, %f2273, %f2270; sub.f32 %f2275, %f2272, %f2274; mul.f32 %f2276, %f2273, %f2269; fma.rn.f32 %f2277, %f2271, %f2270, %f2276; ld.global.f32 %f2278, [%rd608+-16]; add.f32 %f2279, %f2278, %f2275; mov.b32 %r2225, %f2279; ld.global.f32 %f2280, [%rd608+-12]; add.f32 %f2281, %f2280, %f2277; mov.b32 %r2226, %f2281; cvt.u64.u32 %rd6407, %r2226; cvt.u64.u32 %rd6408, %r2225; cvt.u64.u16 %rd6409, %rs40; bfi.b64 %rd11735, %rd6407, %rd6408, 32, 32; and.b64 %rd6410, %rd6409, 255; mov.b64 {%r2227, %r2228}, %rd6410; mov.b32 {%rs344, %rs345}, %r2227; cvt.u64.u16 %rd11734, %rs344; $L__BB1_413: mov.u64 %rd11065, 0; or.b64 %rd6417, %rd11065, %rd11734; or.b64 %rd1081, %rd6417, %rd11065; mov.b64 {%r2229, %r2230}, %rd1081; mov.b32 {%rs41, %rs346}, %r2229; and.b16 %rs347, %rs41, 255; setp.eq.s16 %p644, %rs347, 2; mov.u64 %rd11736, 2; mov.u64 %rd11737, %rd11065; mov.u64 %rd11738, %rd11065; @%p644 bra $L__BB1_415; and.b64 %rd6419, %rd1081, 4294967040; cvt.u64.u16 %rd6420, %rs41; and.b64 %rd6421, %rd6420, 255; or.b64 %rd6422, %rd6421, %rd11065; or.b64 %rd6423, %rd6422, %rd6419; mov.b64 {%r2231, %r2232}, %rd6423; mov.b32 {%rs348, %rs349}, %r2231; not.b16 %rs350, %rs348; ld.global.u8 %rs351, [%rd608+-32]; setp.eq.s16 %p645, %rs351, 0; and.b16 %rs352, %rs350, 1; selp.b16 %rs353, %rs348, %rs352, %p645; and.b64 %rd6424, %rd6423, 4294967040; cvt.u64.u16 %rd6425, %rs353; and.b64 %rd6426, %rd6425, 255; or.b64 %rd6427, %rd6424, %rd11065; or.b64 %rd6428, %rd6427, %rd6426; mov.b64 {%r2233, %r2234}, %rd6428; mov.b32 {%rs354, %rs355}, %r2233; and.b64 %rd11738, %rd6428, 4294967040; cvt.u64.u16 %rd6429, %rs354; and.b64 %rd11736, %rd6429, 255; mov.u64 %rd11737, %rd11735; $L__BB1_415: or.b64 %rd6430, %rd11737, %rd11065; or.b64 %rd6431, %rd11065, %rd11736; or.b64 %rd6432, %rd6431, %rd11738; or.b64 %rd6433, %rd6430, 
%rd11065; mov.b64 {%r4918, %r4919}, %rd6433; mov.b64 {%r4920, %r2235}, %rd6432; bra.uni $L__BB1_472; $L__BB1_275: cvt.u32.u64 %r2028, %rd610; cvt.u32.u64 %r2029, %rd629; rem.u32 %r2030, %r2029, %r2028; cvt.u64.u32 %rd11578, %r2030; $L__BB1_276: shl.b64 %rd5947, %rd11578, 3; add.s64 %rd633, %rd611, %rd5947; ld.u32 %rd5948, [%rd633]; ld.u32 %rd5949, [%rd633+4]; bfi.b64 %rd634, %rd5949, %rd5948, 32, 32; add.s64 %rd635, %rd11578, 1; or.b64 %rd5950, %rd635, %rd610; and.b64 %rd5951, %rd5950, -4294967296; setp.eq.s64 %p462, %rd5951, 0; @%p462 bra $L__BB1_278; rem.u64 %rd11579, %rd635, %rd610; bra.uni $L__BB1_279; $L__BB1_278: cvt.u32.u64 %r2031, %rd610; cvt.u32.u64 %r2032, %rd635; rem.u32 %r2033, %r2032, %r2031; cvt.u64.u32 %rd11579, %r2033; $L__BB1_279: add.u64 %rd11589, %SP, 560; cvta.to.local.u64 %rd11587, %rd11589; shl.b64 %rd5953, %rd11579, 3; add.s64 %rd645, %rd611, %rd5953; ld.u32 %rd5954, [%rd645]; ld.u32 %rd5955, [%rd645+4]; bfi.b64 %rd5956, %rd5955, %rd5954, 32, 32; st.local.v2.u64 [%rd11587], {%rd634, %rd5956}; mov.u64 %rd11594, 2; mov.u64 %rd11580, %rd592; mov.u64 %rd11581, %rd590; mov.u64 %rd11582, %rd590; mov.u64 %rd11583, %rd591; mov.u64 %rd11584, %rd590; mov.u64 %rd11585, %rd590; mov.u64 %rd11586, %rd591; mov.u64 %rd11588, %rd11587; mov.u64 %rd11590, %rd11587; mov.u64 %rd11591, %rd11587; mov.u64 %rd11592, %rd11589; mov.u64 %rd11593, %rd593; $L__BB1_280: setp.eq.s64 %p463, %rd11594, 0; @%p463 bra $L__BB1_283; add.s64 %rd11594, %rd11594, -1; add.s64 %rd5957, %rd11581, 8; setp.eq.s64 %p464, %rd11584, %rd11580; selp.b64 %rd5958, %rd5957, %rd11584, %p464; add.s64 %rd5959, %rd11582, 8; selp.b64 %rd5960, %rd5959, %rd11585, %p464; add.s64 %rd5961, %rd11583, 8; selp.b64 %rd5962, %rd5961, %rd11586, %p464; setp.eq.s64 %p465, %rd11594, 0; add.s64 %rd5963, %rd5958, 4; add.s64 %rd5964, %rd5960, 4; add.s64 %rd5965, %rd5962, 4; selp.b64 %rd662, %rd5958, %rd5963, %p465; selp.b64 %rd11585, %rd5960, %rd5964, %p465; selp.b64 %rd11586, %rd5962, %rd5965, %p465; selp.b64 
%rd11581, %rd5957, %rd11581, %p464; selp.b64 %rd11582, %rd5959, %rd11582, %p464; selp.b64 %rd11583, %rd5961, %rd11583, %p464; add.s64 %rd5966, %rd11584, 8; selp.b64 %rd11580, %rd5966, %rd11580, %p464; add.s64 %rd5967, %rd11590, 8; setp.eq.s64 %p466, %rd11587, %rd11593; selp.b64 %rd5968, %rd5967, %rd11587, %p466; add.s64 %rd5969, %rd11591, 8; selp.b64 %rd5970, %rd5969, %rd11588, %p466; add.s64 %rd5971, %rd11592, 8; selp.b64 %rd5972, %rd5971, %rd11589, %p466; selp.b64 %rd11590, %rd5967, %rd11590, %p466; selp.b64 %rd11591, %rd5969, %rd11591, %p466; selp.b64 %rd11592, %rd5971, %rd11592, %p466; add.s64 %rd5973, %rd11587, 8; selp.b64 %rd11593, %rd5973, %rd11593, %p466; add.s64 %rd5974, %rd5968, 4; add.s64 %rd5975, %rd5970, 4; add.s64 %rd5976, %rd5972, 4; selp.b64 %rd11587, %rd5968, %rd5974, %p465; selp.b64 %rd11588, %rd5970, %rd5975, %p465; selp.b64 %rd11589, %rd5972, %rd5976, %p465; ld.local.f32 %f2035, [%rd5970]; ld.local.f32 %f2036, [%rd5960]; setp.eq.f32 %p467, %f2036, %f2035; mov.u64 %rd11584, %rd662; @%p467 bra $L__BB1_280; bra.uni $L__BB1_282; $L__BB1_283: ld.u32 %rd5977, [%rd633]; ld.u32 %rd5978, [%rd633+4]; bfi.b64 %rd5979, %rd5978, %rd5977, 32, 32; cvt.u32.u64 %r2034, %rd5979; mov.b32 %f2037, %r2034; shr.u64 %rd5980, %rd5979, 32; cvt.u32.u64 %r2035, %rd5980; mov.b32 %f2038, %r2035; ld.u32 %rd5981, [%rd645]; ld.u32 %rd5982, [%rd645+4]; bfi.b64 %rd5983, %rd5982, %rd5981, 32, 32; cvt.u32.u64 %r2036, %rd5983; shr.u64 %rd5984, %rd5983, 32; cvt.u32.u64 %r2037, %rd5984; mov.b32 %f2039, %r2036; sub.f32 %f5336, %f2039, %f2037; mov.b32 %f2040, %r2037; sub.f32 %f5337, %f2040, %f2038; bra.uni $L__BB1_294; $L__BB1_288: cvt.u32.u64 %r2038, %rd610; cvt.u32.u64 %r2039, %rd676; rem.u32 %r2040, %r2039, %r2038; cvt.u64.u32 %rd11595, %r2040; $L__BB1_289: shl.b64 %rd5993, %rd11595, 3; add.s64 %rd5994, %rd611, %rd5993; ld.u32 %rd5995, [%rd5994]; ld.u32 %rd5996, [%rd5994+4]; bfi.b64 %rd687, %rd5996, %rd5995, 32, 32; add.u64 %rd5998, %SPL, 560; st.local.v2.u64 [%rd5998], {%rd677, 
%rd687}; mov.u64 %rd11610, 2; mov.u64 %rd11596, %rd590; mov.u64 %rd11597, %rd587; mov.u64 %rd11598, %rd587; mov.u64 %rd11599, %rd589; mov.u64 %rd11600, %rd587; mov.u64 %rd11601, %rd587; mov.u64 %rd11602, %rd589; mov.u64 %rd11603, %rd594; mov.u64 %rd11604, %rd594; mov.u64 %rd11605, %rd595; mov.u64 %rd11606, %rd594; mov.u64 %rd11607, %rd594; mov.u64 %rd11608, %rd595; mov.u64 %rd11609, %rd596; $L__BB1_290: setp.eq.s64 %p471, %rd11610, 0; @%p471 bra $L__BB1_293; add.s64 %rd11610, %rd11610, -1; add.s64 %rd5999, %rd11597, 8; setp.eq.s64 %p472, %rd11600, %rd11596; selp.b64 %rd6000, %rd5999, %rd11600, %p472; add.s64 %rd6001, %rd11598, 8; selp.b64 %rd6002, %rd6001, %rd11601, %p472; add.s64 %rd6003, %rd11599, 8; selp.b64 %rd6004, %rd6003, %rd11602, %p472; setp.eq.s64 %p473, %rd11610, 0; add.s64 %rd6005, %rd6000, 4; add.s64 %rd6006, %rd6002, 4; add.s64 %rd6007, %rd6004, 4; selp.b64 %rd704, %rd6000, %rd6005, %p473; selp.b64 %rd11601, %rd6002, %rd6006, %p473; selp.b64 %rd11602, %rd6004, %rd6007, %p473; selp.b64 %rd11597, %rd5999, %rd11597, %p472; selp.b64 %rd11598, %rd6001, %rd11598, %p472; selp.b64 %rd11599, %rd6003, %rd11599, %p472; add.s64 %rd6008, %rd11600, 8; selp.b64 %rd11596, %rd6008, %rd11596, %p472; add.s64 %rd6009, %rd11606, 8; setp.eq.s64 %p474, %rd11603, %rd11609; selp.b64 %rd6010, %rd6009, %rd11603, %p474; add.s64 %rd6011, %rd11607, 8; selp.b64 %rd6012, %rd6011, %rd11604, %p474; add.s64 %rd6013, %rd11608, 8; selp.b64 %rd6014, %rd6013, %rd11605, %p474; selp.b64 %rd11606, %rd6009, %rd11606, %p474; selp.b64 %rd11607, %rd6011, %rd11607, %p474; selp.b64 %rd11608, %rd6013, %rd11608, %p474; add.s64 %rd6015, %rd11603, 8; selp.b64 %rd11609, %rd6015, %rd11609, %p474; add.s64 %rd6016, %rd6010, 4; add.s64 %rd6017, %rd6012, 4; add.s64 %rd6018, %rd6014, 4; selp.b64 %rd11603, %rd6010, %rd6016, %p473; selp.b64 %rd11604, %rd6012, %rd6017, %p473; selp.b64 %rd11605, %rd6014, %rd6018, %p473; ld.local.f32 %f2041, [%rd6012]; ld.local.f32 %f2042, [%rd6002]; setp.eq.f32 %p475, %f2042, 
%f2041; mov.u64 %rd11600, %rd704; @%p475 bra $L__BB1_290; bra.uni $L__BB1_292; $L__BB1_293: cvt.u32.u64 %r2041, %rd677; mov.b32 %f2043, %r2041; shr.u64 %rd6019, %rd677, 32; cvt.u32.u64 %r2042, %rd6019; mov.b32 %f2044, %r2042; shr.u64 %rd6020, %rd687, 32; cvt.u32.u64 %r2043, %rd6020; cvt.u32.u64 %r2044, %rd687; mov.b32 %f2045, %r2044; sub.f32 %f2046, %f2045, %f2043; mov.b32 %f2047, %r2043; sub.f32 %f2048, %f2047, %f2044; neg.f32 %f5336, %f2046; neg.f32 %f5337, %f2048; $L__BB1_294: mul.f32 %f2049, %f213, %f5337; fma.rn.f32 %f220, %f212, %f5336, %f2049; mul.f32 %f2050, %f5337, %f5337; fma.rn.f32 %f2051, %f5336, %f5336, %f2050; add.f32 %f2052, %f2051, 0f00000000; sqrt.rn.f32 %f2053, %f2052; mul.f32 %f2054, %f2053, 0f3A83126F; abs.f32 %f2055, %f220; setp.gt.f32 %p476, %f2055, %f2054; @%p476 bra $L__BB1_296; bra.uni $L__BB1_295; $L__BB1_296: setp.ge.f32 %p2914, %f220, 0f00000000; bra.uni $L__BB1_299; $L__BB1_295: ld.local.u64 %rd6021, [%rd609+8]; cvt.u32.u64 %r2045, %rd6021; mov.b32 %f2056, %r2045; shr.u64 %rd6022, %rd6021, 32; cvt.u32.u64 %r2046, %rd6022; mov.b32 %f2057, %r2046; sub.f32 %f2058, %f179, %f2056; sub.f32 %f2059, %f180, %f2057; mul.f32 %f2060, %f213, %f2059; fma.rn.f32 %f2061, %f212, %f2058, %f2060; setp.le.f32 %p2914, %f2061, 0f00000000; $L__BB1_299: selp.u16 %rs291, 1, 0, %p2914; st.local.u8 [%rd609+16], %rs291; $L__BB1_300: ld.local.v2.u32 {%r4856, %r4857}, [%rd609+8]; ld.local.u32 %r4858, [%rd609+16]; $L__BB1_302: setp.eq.s32 %p477, %r197, 2; mov.u64 %rd6030, 0; mov.u64 %rd11611, 2; mov.u64 %rd11612, %rd6030; @%p477 bra $L__BB1_304; setp.ne.s16 %p478, %rs25, 0; cvt.u16.u32 %rs293, %r4858; selp.u16 %rs294, 1, 0, %p478; xor.b16 %rs295, %rs293, %rs294; mov.b32 %f2068, %r4856; mov.b32 %f2069, %r4857; mul.f32 %f2070, %f183, %f2068; ld.global.f32 %f2071, [%rd608+-20]; mul.f32 %f2072, %f2071, %f2069; sub.f32 %f2073, %f2070, %f2072; mul.f32 %f2074, %f2071, %f2068; fma.rn.f32 %f2075, %f183, %f2069, %f2074; add.f32 %f2076, %f181, %f2073; mov.b32 %r2051, %f2076; 
add.f32 %f2077, %f182, %f2075; mov.b32 %r2052, %f2077; cvt.u64.u32 %rd6031, %r2052; cvt.u64.u32 %rd6032, %r2051; cvt.u64.u16 %rd6033, %rs295; bfi.b64 %rd11612, %rd6031, %rd6032, 32, 32; and.b64 %rd6034, %rd6033, 255; mov.b64 {%r2053, %r2054}, %rd6034; mov.b32 {%rs296, %rs297}, %r2053; cvt.u64.u16 %rd11611, %rs296; $L__BB1_304: or.b64 %rd6035, %rd6030, %rd6030; or.b64 %rd6036, %rd11611, %rd6030; or.b64 %rd6037, %rd6036, %rd6030; or.b64 %rd6038, %rd6035, %rd11612; mov.b64 {%r4918, %r4919}, %rd6038; mov.b64 {%r4920, %r2055}, %rd6037; $L__BB1_472: mov.u32 %r4921, 2; mov.u64 %rd11756, 0; mov.b32 {%rs46, %rs365}, %r4920; and.b16 %rs366, %rs46, 255; setp.eq.s16 %p723, %rs366, 2; @%p723 bra $L__BB1_474; mov.b64 %rd6515, {%r4920, %r2306}; shr.u64 %rd6516, %rd6515, 8; and.b64 %rd6517, %rd6516, 16777215; cvt.u64.u16 %rd6518, %rs46; and.b64 %rd6519, %rd6518, 255; mov.b64 %rd11756, {%r4918, %r4919}; bfi.b64 %rd1137, %rd6517, %rd6519, 8, 56; mov.b64 {%r4921, %r2307}, %rd1137; $L__BB1_474: mov.b32 {%rs367, %rs368}, %r4921; and.b16 %rs369, %rs367, 255; setp.eq.s16 %p724, %rs369, 2; cvt.u64.u16 %rd6520, %rs367; and.b64 %rd6521, %rd6520, 255; selp.b64 %rd6522, 2, %rd6521, %p724; mov.b64 %rd6523, {%r4921, %r2308}; and.b64 %rd6524, %rd6523, 4294967040; or.b64 %rd1143, %rd6524, %rd6522; mov.b64 {%r2309, %r2310}, %rd1143; mov.b32 {%rs47, %rs370}, %r2309; and.b16 %rs371, %rs47, 255; setp.eq.s16 %p725, %rs371, 2; @%p725 bra $L__BB1_476; bra.uni $L__BB1_475; $L__BB1_476: setp.ne.s64 %p726, %rd607, 0; add.s64 %rd11570, %rd605, 280; add.s64 %rd11571, %rd606, 280; @%p726 bra $L__BB1_243; $L__BB1_477: mov.u32 %r4922, 2; mov.u64 %rd11756, 0; add.s64 %rd1182, %rd605, 280; add.s64 %rd1183, %rd606, 280; bra.uni $L__BB1_478; $L__BB1_475: add.s64 %rd1182, %rd605, 280; add.s64 %rd1183, %rd606, 280; shl.b64 %rd6525, %rd1143, 16; shr.u64 %rd6526, %rd6525, 24; cvt.u64.u16 %rd6527, %rs47; and.b64 %rd6528, %rd6527, 255; bfi.b64 %rd6529, %rd6526, %rd6528, 8, 56; mov.b64 {%r4922, %r2311}, %rd6529; 
$L__BB1_478: mov.u64 %rd11361, 0; mov.b32 {%rs372, %rs373}, %r4922; and.b16 %rs374, %rs372, 255; setp.eq.s16 %p727, %rs374, 2; cvt.u64.u16 %rd6532, %rs372; and.b64 %rd6533, %rd6532, 255; selp.b64 %rd6534, 2, %rd6533, %p727; mov.b64 %rd6535, {%r4922, %r2316}; and.b64 %rd6536, %rd6535, 4294967040; or.b64 %rd6537, %rd6536, %rd11361; or.b64 %rd1156, %rd6537, %rd6534; mov.b64 {%r2317, %r2318}, %rd1156; mov.b32 {%rs48, %rs375}, %r2317; and.b16 %rs376, %rs48, 255; setp.eq.s16 %p728, %rs376, 2; mov.u32 %r537, 0; @%p728 bra $L__BB1_712; mov.u64 %rd11362, 0; and.b64 %rd6538, %rd1156, 4294967040; cvt.u64.u16 %rd6539, %rs48; and.b64 %rd6540, %rd6539, 255; or.b64 %rd6541, %rd6540, %rd11362; or.b64 %rd6542, %rd6541, %rd6538; mov.b64 {%r2320, %r2321}, %rd6542; mov.b32 {%rs377, %rs378}, %r2320; shr.u64 %rd6543, %rd11756, 32; cvt.u32.u64 %r2322, %rd6543; cvt.u32.u64 %r2323, %rd11756; mov.b32 %f2383, %r2323; sub.f32 %f2384, %f2383, %f179; mov.b32 %f2385, %r2322; sub.f32 %f2386, %f2385, %f180; mul.f32 %f2387, %f2386, %f2386; fma.rn.f32 %f2388, %f2384, %f2384, %f2387; add.f32 %f2389, %f2388, 0f00000000; sqrt.rn.f32 %f2390, %f2389; and.b16 %rs379, %rs377, 1; setp.eq.b16 %p729, %rs379, 1; selp.f32 %f2391, 0fBF800000, 0f3F800000, %p729; mul.f32 %f517, %f2391, %f2390; setp.eq.s64 %p730, %rd1183, 0; setp.eq.s64 %p731, %rd607, 0; or.pred %p732, %p730, %p731; mov.u32 %r537, 1; @%p732 bra $L__BB1_712; add.u64 %rd6544, %SP, 560; add.u64 %rd6545, %SPL, 560; add.s64 %rd1157, %rd6545, 8; add.u64 %rd6548, %SP, 0; add.u64 %rd6549, %SPL, 0; add.s64 %rd1158, %rd6549, 8; add.s64 %rd1159, %rd6549, 8; add.s64 %rd1160, %rd6549, 8; add.s64 %rd1161, %rd6549, 8; add.s64 %rd1162, %rd6549, 8; add.s64 %rd1163, %rd6549, 8; add.u64 %rd6560, %SP, 552; add.u64 %rd6561, %SPL, 552; add.s64 %rd1164, %rd6561, 8; add.u64 %rd6562, %SP, 32; add.u64 %rd6563, %SPL, 32; add.s64 %rd1165, %rd6563, 36; add.s64 %rd1166, %rd6563, 4; add.s64 %rd1167, %rd6562, 36; add.s64 %rd1168, %rd6563, 44; add.s64 %rd1169, %rd6562, 44; add.s64 
%rd1170, %rd6563, 52; add.s64 %rd1171, %rd6545, 8; add.s64 %rd1172, %rd6545, 8; or.b64 %rd1173, %rd6544, 8; add.s64 %rd1174, %rd6545, 16; add.s64 %rd1175, %rd7, 32; add.s64 %rd1176, %rd7, 48; add.s64 %rd1177, %rd7, 64; add.s64 %rd1178, %rd7, 80; add.s64 %rd1179, %rd7, 96; add.s64 %rd1180, %rd7, 112; $L__BB1_481: add.s64 %rd607, %rd607, -1; ld.global.u32 %r2324, [%rd1182+272]; setp.eq.s32 %p733, %r2324, 3; @%p733 bra $L__BB1_711; ld.global.u16 %rs380, [%rd1182]; setp.eq.s16 %p734, %rs380, 1; @%p734 bra $L__BB1_653; setp.eq.s16 %p735, %rs380, 2; @%p735 bra $L__BB1_542; setp.ne.s16 %p736, %rs380, 3; @%p736 bra $L__BB1_691; ld.global.u8 %rs49, [%rd1182+24]; ld.global.f32 %f349, [%rd1182+256]; sub.f32 %f2392, %f179, %f349; ld.global.f32 %f350, [%rd1182+260]; sub.f32 %f2393, %f180, %f350; ld.global.f32 %f2394, [%rd1182+252]; ld.global.f32 %f351, [%rd1182+248]; mul.f32 %f2395, %f2393, %f2394; fma.rn.f32 %f352, %f2392, %f351, %f2395; mul.f32 %f2396, %f2392, %f2394; mul.f32 %f2397, %f2393, %f351; sub.f32 %f353, %f2397, %f2396; cvta.to.local.u64 %rd1186, %rd6562; mov.u32 %r383, 2; st.local.u32 [%rd1186+20], %r383; ld.global.u64 %rd1187, [%rd1182+16]; setp.eq.s64 %p737, %rd1187, 0; @%p737 bra $L__BB1_539; mov.b32 %r2339, %f353; ld.global.u64 %rd1188, [%rd1182+8]; mov.b32 %r2340, %f352; and.b32 %r2341, %r2340, 2147483647; mov.b32 %f354, %r2341; and.b32 %r2342, %r2339, 2147483647; mov.b32 %f355, %r2342; mov.u64 %rd11761, 1; bra.uni $L__BB1_487; $L__BB1_495: sub.f32 %f2409, %f5353, %f352; abs.f32 %f370, %f2409; setp.le.f32 %p747, %f370, 0f34000000; @%p747 bra $L__BB1_497; abs.f32 %f2410, %f5353; abs.f32 %f2411, %f352; setp.gt.f32 %p749, %f2411, %f2410; selp.f32 %f2412, %f2411, %f2410, %p749; mul.f32 %f2413, %f2412, 0f34000000; setp.gtu.f32 %p750, %f370, %f2413; @%p750 bra $L__BB1_501; bra.uni $L__BB1_497; $L__BB1_487: shl.b64 %rd6571, %rd11761, 3; add.s64 %rd6572, %rd1188, %rd6571; setp.eq.s64 %p738, %rd11761, %rd1187; selp.b64 %rd6573, 0, %rd11761, %p738; shl.b64 %rd6574, 
%rd6573, 3; add.s64 %rd6575, %rd1188, %rd6574; ld.u32 %rd6576, [%rd6572+-8]; ld.u32 %rd6577, [%rd6572+-4]; bfi.b64 %rd1191, %rd6577, %rd6576, 32, 32; ld.u32 %rd6578, [%rd6575]; ld.u32 %rd6579, [%rd6575+4]; bfi.b64 %rd1192, %rd6579, %rd6578, 32, 32; cvt.u32.u64 %r4924, %rd1191; mov.b32 %f5353, %r4924; shr.u64 %rd6580, %rd1191, 32; cvt.u32.u64 %r2345, %rd6580; mov.b32 %f358, %r2345; cvt.u32.u64 %r367, %rd1192; shr.u64 %rd6581, %rd1192, 32; cvt.u32.u64 %r2346, %rd6581; mov.b32 %f359, %r367; sub.f32 %f360, %f359, %f5353; mov.b32 %f2399, %r2346; sub.f32 %f361, %f2399, %f358; sub.f32 %f2400, %f352, %f5353; sub.f32 %f2401, %f353, %f358; mul.f32 %f2402, %f361, %f2401; fma.rn.f32 %f362, %f360, %f2400, %f2402; mul.f32 %f2403, %f361, %f361; fma.rn.f32 %f2404, %f360, %f360, %f2403; add.f32 %f363, %f2404, 0f00000000; setp.gtu.f32 %p739, %f362, 0f00000000; mov.b64 {%r2347, %r4925}, %rd1191; mov.b64 {%r2348, %r369}, %rd1192; @%p739 bra $L__BB1_489; bra.uni $L__BB1_488; $L__BB1_489: setp.ltu.f32 %p740, %f362, %f363; @%p740 bra $L__BB1_491; bra.uni $L__BB1_490; $L__BB1_491: setp.eq.f32 %p741, %f363, 0f00000000; @%p741 bra $L__BB1_538; div.rn.f32 %f2405, %f362, %f363; mov.f32 %f2406, 0f3F800000; sub.f32 %f2407, %f2406, %f2405; mov.b32 %r4927, %f2407; mov.b32 %r4928, %f2405; fma.rn.f32 %f5353, %f360, %f2405, %f5353; mov.b32 %r4924, %f5353; fma.rn.f32 %f5354, %f361, %f2405, %f358; mov.b32 %r4925, %f5354; mov.u32 %r4926, 1; bra.uni $L__BB1_493; $L__BB1_488: mov.b32 %f5354, %r4925; mov.u32 %r4926, 0; mov.u32 %r4927, %r4926; bra.uni $L__BB1_493; $L__BB1_490: mov.b32 %f5354, %r369; mov.u32 %r4927, 1; mov.u32 %r4926, 0; mov.f32 %f5353, %f359; mov.u32 %r4924, %r367; mov.u32 %r4925, %r369; $L__BB1_493: setp.eq.f32 %p742, %f352, %f5353; @%p742 bra $L__BB1_497; bra.uni $L__BB1_494; $L__BB1_497: setp.eq.f32 %p752, %f5354, %f353; mov.pred %p751, -1; mov.pred %p2919, %p751; @%p752 bra $L__BB1_501; setp.eq.f32 %p754, %f355, 0f7F800000; and.b32 %r2357, %r4925, 2147483647; mov.b32 %f2414, %r2357; 
setp.eq.f32 %p755, %f2414, 0f7F800000; or.pred %p756, %p754, %p755; mov.pred %p2919, 0; @%p756 bra $L__BB1_501; sub.f32 %f2415, %f5354, %f353; abs.f32 %f371, %f2415; setp.le.f32 %p758, %f371, 0f34000000; mov.pred %p2919, %p751; @%p758 bra $L__BB1_501; abs.f32 %f2416, %f5354; abs.f32 %f2417, %f353; setp.gt.f32 %p759, %f2417, %f2416; selp.f32 %f2418, %f2417, %f2416, %p759; mul.f32 %f2419, %f2418, 0f34000000; setp.le.f32 %p2919, %f371, %f2419; bra.uni $L__BB1_501; $L__BB1_494: setp.eq.f32 %p744, %f354, 0f7F800000; and.b32 %r2356, %r4924, 2147483647; mov.b32 %f2408, %r2356; setp.eq.f32 %p745, %f2408, 0f7F800000; or.pred %p746, %p744, %p745; mov.pred %p2919, 0; @%p746 bra $L__BB1_501; bra.uni $L__BB1_495; $L__BB1_501: cvt.u64.u32 %rd6582, %r4925; cvt.u64.u32 %rd6583, %r4924; bfi.b64 %rd1193, %rd6582, %rd6583, 32, 32; mov.b64 {%r2358, %r2359}, %rd1193; selp.u64 %rd1194, 1, 0, %p2919; mov.b32 %f373, %r2359; mov.b32 %f372, %r2358; sub.f32 %f2420, %f372, %f352; sub.f32 %f2421, %f373, %f353; mul.f32 %f2422, %f2421, %f2421; fma.rn.f32 %f2423, %f2420, %f2420, %f2422; add.f32 %f2424, %f2423, 0f00000000; sqrt.rn.f32 %f375, %f2424; setp.geu.f32 %p760, %f375, %f5355; setp.ne.s32 %p761, %r383, 2; and.pred %p762, %p761, %p760; @%p762 bra $L__BB1_503; add.s64 %rd11762, %rd11761, -1; st.local.u64 [%rd1186], %rd11762; st.local.v2.f32 [%rd1186+8], {%f372, %f373}; mov.b64 {%r2362, %r2363}, %rd1194; st.local.v2.u32 [%rd1186+16], {%r2362, %r4926}; st.local.v2.u32 [%rd1186+24], {%r4927, %r4928}; st.local.f32 [%rd1186+32], %f375; st.local.u32 [%rd1186+36], %rd1191; st.local.u32 [%rd1186+44], %rd1192; st.local.u32 [%rd1186+40], %rd6580; st.local.u32 [%rd1186+48], %rd6581; mov.f32 %f5355, %f375; mov.u32 %r383, %r4926; $L__BB1_503: add.s64 %rd1197, %rd11761, 1; setp.lt.u64 %p763, %rd11761, %rd1187; mov.u64 %rd11761, %rd1197; @%p763 bra $L__BB1_487; ld.local.u32 %rd6590, [%rd1186+36]; ld.local.u32 %rd6591, [%rd1186+40]; bfi.b64 %rd6592, %rd6591, %rd6590, 32, 32; mov.u64 %rd6589, 0; cvt.u32.u64 
%r2364, %rd6592; mov.b32 %f2425, %r2364; shr.u64 %rd6593, %rd6592, 32; cvt.u32.u64 %r2365, %rd6593; mov.b32 %f2426, %r2365; ld.local.u32 %rd6594, [%rd1186+44]; ld.local.u32 %rd6595, [%rd1186+48]; bfi.b64 %rd6596, %rd6595, %rd6594, 32, 32; cvt.u32.u64 %r2366, %rd6596; shr.u64 %rd6597, %rd6596, 32; cvt.u32.u64 %r2367, %rd6597; mov.b32 %f2427, %r2366; sub.f32 %f377, %f2427, %f2425; mov.b32 %f2428, %r2367; sub.f32 %f378, %f2428, %f2426; mul.f32 %f2429, %f378, %f378; fma.rn.f32 %f2430, %f377, %f377, %f2429; add.f32 %f379, %f2430, 0f00000000; setp.leu.f32 %p764, %f379, 0f28800000; mov.u64 %rd11763, %rd6589; mov.u64 %rd11764, %rd6589; mov.u64 %rd11765, %rd6589; @%p764 bra $L__BB1_506; neg.f32 %f2431, %f377; sqrt.rn.f32 %f2432, %f379; div.rn.f32 %f2433, %f378, %f2432; div.rn.f32 %f2434, %f2431, %f2432; mov.b32 %r2368, %f2434; mov.b32 %r2369, %f2433; mov.u64 %rd11765, 1; mov.b64 %rd6600, {%r2369, %r2368}; shr.u64 %rd11764, %rd6600, 32; shl.b64 %rd11763, %rd6600, 32; $L__BB1_506: or.b64 %rd1204, %rd11765, %rd11763; or.b64 %rd1205, %rd6589, %rd11764; and.b64 %rd6601, %rd6589, 4294967295; xor.b64 %rd6602, %rd11765, 1; or.b64 %rd6603, %rd6602, %rd6601; setp.ne.s64 %p765, %rd6603, 0; @%p765 bra $L__BB1_537; mov.b64 {%r2370, %r2371}, %rd1205; mov.b64 {%r2372, %r2373}, %rd1204; mov.b32 %f380, %r2373; mov.b32 %f381, %r2370; setp.eq.s32 %p766, %r383, 1; @%p766 bra $L__BB1_535; bra.uni $L__BB1_508; $L__BB1_535: ld.local.u64 %rd6682, [%rd1186+8]; cvt.u32.u64 %r2394, %rd6682; mov.b32 %f2462, %r2394; shr.u64 %rd6683, %rd6682, 32; cvt.u32.u64 %r2395, %rd6683; mov.b32 %f2463, %r2395; sub.f32 %f2464, %f179, %f2462; sub.f32 %f2465, %f180, %f2463; mul.f32 %f2466, %f381, %f2465; fma.rn.f32 %f2467, %f380, %f2464, %f2466; setp.le.f32 %p2920, %f2467, 0f00000000; bra.uni $L__BB1_536; $L__BB1_542: ld.global.f32 %f2478, [%rd1182+256]; mov.u64 %rd6703, 0; sub.f32 %f2479, %f179, %f2478; ld.global.f32 %f2480, [%rd1182+260]; sub.f32 %f2481, %f180, %f2480; ld.global.f32 %f2482, [%rd1182+252]; 
ld.global.f32 %f2483, [%rd1182+248]; mul.f32 %f2484, %f2481, %f2482; fma.rn.f32 %f389, %f2479, %f2483, %f2484; mul.f32 %f2485, %f2479, %f2482; mul.f32 %f2486, %f2481, %f2483; sub.f32 %f390, %f2486, %f2485; mov.b32 %r2403, %f389; mov.b32 %r2404, %f390; cvt.u64.u32 %rd6704, %r2404; cvt.u64.u32 %rd6705, %r2403; bfi.b64 %rd6706, %rd6704, %rd6705, 32, 32; st.local.u64 [%rd6561], %rd6706; ld.global.u64 %rd1307, [%rd1182+32]; setp.eq.s64 %p787, %rd1307, 0; mov.u64 %rd6701, 2; mov.u64 %rd11919, %rd6703; mov.u64 %rd11920, %rd6701; mov.u64 %rd11921, %rd6703; @%p787 bra $L__BB1_648; cvta.to.local.u64 %rd1308, %rd6562; mov.u32 %r2411, 0; st.local.u32 [%rd1308], %r2411; mov.u32 %r2412, -16777217; st.local.u32 [%rd1308+4], %r2412; mov.u32 %r405, 1; st.local.u32 [%rd1308+512], %r405; ld.global.u64 %rd1309, [%rd1182+24]; ld.global.u64 %rd1310, [%rd1182+80]; ld.global.u64 %rd1311, [%rd1182+72]; mov.u32 %r403, 2139095039; mov.u32 %r402, 4; bra.uni $L__BB1_544; $L__BB1_653: ld.global.f32 %f456, [%rd1182+256]; sub.f32 %f2682, %f179, %f456; ld.global.f32 %f457, [%rd1182+260]; sub.f32 %f2683, %f180, %f457; ld.global.f32 %f2684, [%rd1182+252]; ld.global.f32 %f458, [%rd1182+248]; mul.f32 %f2685, %f2683, %f2684; fma.rn.f32 %f459, %f2682, %f458, %f2685; mul.f32 %f2686, %f2682, %f2684; mul.f32 %f2687, %f2683, %f458; sub.f32 %f460, %f2687, %f2686; mov.b32 %r506, %f459; mov.b32 %r507, %f460; ld.global.u64 %rd1670, [%rd1182+56]; ld.global.u64 %rd1669, [%rd1182+48]; sub.f32 %f2688, %f459, %f6; sub.f32 %f2689, %f460, %f6; mov.b32 %r2583, %f2688; mov.b32 %r2584, %f2689; cvt.u64.u32 %rd7093, %r2584; cvt.u64.u32 %rd7094, %r2583; add.f32 %f2690, %f6, %f459; add.f32 %f2691, %f6, %f460; mov.b32 %r2585, %f2690; mov.b32 %r2586, %f2691; cvt.u64.u32 %rd7095, %r2586; cvt.u64.u32 %rd7096, %r2585; bfi.b64 %rd7097, %rd7093, %rd7094, 32, 32; mov.b64 {%r2587, %r2588}, %rd7097; bfi.b64 %rd7098, %rd7095, %rd7096, 32, 32; mov.b64 {%r2589, %r2590}, %rd7098; cvta.to.local.u64 %rd1671, %rd6562; mov.u16 %rs446, 2; 
st.local.u8 [%rd1671+8], %rs446; mov.b32 %f464, %r2590; mov.b32 %f462, %r2588; mov.b32 %f463, %r2589; mov.b32 %f461, %r2587; ld.global.v2.f32 {%f2692, %f2693}, [%rd1182+40]; div.rn.f32 %f467, %f461, %f2692; div.rn.f32 %f468, %f463, %f2692; ld.global.u64 %rd1672, [%rd1182+16]; cvt.rn.f32.u64 %f2694, %rd1672; add.f32 %f2695, %f2694, 0fBF800000; rcp.rn.f32 %f469, %f2695; setp.lt.f32 %p954, %f468, 0fBF000000; setp.gt.f32 %p955, %f467, 0f3F000000; or.pred %p956, %p955, %p954; @%p956 bra $L__BB1_685; add.f32 %f2696, %f467, 0f3F000000; div.rn.f32 %f2697, %f2696, %f469; cvt.rmi.f32.f32 %f2698, %f2697; add.s64 %rd7100, %rd1672, -2; cvt.rn.f32.u64 %f2699, %rd7100; setp.gt.f32 %p957, %f2698, 0f00000000; setp.lt.f32 %p958, %f2698, %f2699; selp.f32 %f2700, %f2698, %f2699, %p958; selp.f32 %f2701, %f2700, 0f00000000, %p957; setp.gt.f32 %p959, %f2701, 0f5F7FFFFF; max.f32 %f2702, %f2701, 0f00000000; cvt.rzi.u64.f32 %rd7101, %f2702; selp.b64 %rd1678, -1, %rd7101, %p959; add.f32 %f2703, %f468, 0f3F000000; div.rn.f32 %f2704, %f2703, %f469; cvt.rpi.f32.f32 %f2705, %f2704; add.s64 %rd7102, %rd1672, -1; cvt.rn.f32.u64 %f2706, %rd7102; setp.gt.f32 %p960, %f2705, 0f00000000; setp.lt.f32 %p961, %f2705, %f2706; selp.f32 %f2707, %f2705, %f2706, %p961; selp.f32 %f2708, %f2707, 0f00000000, %p960; setp.gt.f32 %p962, %f2708, 0f5F7FFFFF; max.f32 %f2709, %f2708, 0f00000000; cvt.rzi.u64.f32 %rd7103, %f2709; selp.b64 %rd1674, -1, %rd7103, %p962; setp.ge.u64 %p963, %rd1678, %rd1674; @%p963 bra $L__BB1_685; div.rn.f32 %f470, %f462, %f2693; div.rn.f32 %f471, %f464, %f2693; ld.global.u64 %rd1675, [%rd1182+32]; ld.global.u64 %rd1676, [%rd1182+24]; ld.global.u64 %rd1677, [%rd1182+8]; and.b32 %r2591, %r506, 2147483647; mov.b32 %f472, %r2591; and.b32 %r2592, %r507, 2147483647; mov.b32 %f473, %r2592; ld.local.v4.u32 {%r4989, %r4990, %r4991, %r2596}, [%rd1671]; mov.f32 %f5367, 0f7F7FFFFF; bra.uni $L__BB1_656; $L__BB1_691: ld.global.f32 %f498, [%rd1182+256]; sub.f32 %f2751, %f179, %f498; ld.global.f32 %f499, 
[%rd1182+260]; sub.f32 %f2752, %f180, %f499; ld.global.f32 %f500, [%rd1182+252]; ld.global.f32 %f501, [%rd1182+248]; mul.f32 %f2753, %f2752, %f500; fma.rn.f32 %f502, %f2751, %f501, %f2753; mul.f32 %f2754, %f2751, %f500; mul.f32 %f2755, %f2752, %f501; sub.f32 %f503, %f2755, %f2754; ld.global.u32 %rd7130, [%rd1182+8]; ld.global.u32 %rd7131, [%rd1182+12]; bfi.b64 %rd7132, %rd7131, %rd7130, 32, 32; cvt.u32.u64 %r2635, %rd7132; mov.b32 %f2756, %r2635; shr.u64 %rd7133, %rd7132, 32; cvt.u32.u64 %r2636, %rd7133; mov.b32 %f2757, %r2636; neg.f32 %f2758, %f2756; neg.f32 %f2759, %f2757; sub.f32 %f504, %f2758, %f502; sub.f32 %f505, %f2759, %f503; sub.f32 %f506, %f502, %f2756; sub.f32 %f507, %f503, %f2757; setp.ge.f32 %p1012, %f504, 0f00000000; selp.f32 %f2760, %f504, 0f00000000, %p1012; setp.ge.f32 %p1013, %f505, 0f00000000; selp.f32 %f2761, %f505, 0f00000000, %p1013; setp.ge.f32 %p1014, %f506, 0f00000000; selp.f32 %f2762, %f506, 0f00000000, %p1014; setp.ge.f32 %p1015, %f507, 0f00000000; selp.f32 %f2763, %f507, 0f00000000, %p1015; sub.f32 %f508, %f2760, %f2762; mov.b32 %r2637, %f508; sub.f32 %f509, %f2761, %f2763; mov.b32 %r2638, %f509; cvt.u64.u32 %rd7134, %r2638; cvt.u64.u32 %rd7135, %r2637; bfi.b64 %rd7136, %rd7134, %rd7135, 32, 32; st.local.u64 [%rd6545], %rd7136; mov.u64 %rd11935, 2; mov.u64 %rd11928, %rd1157; mov.u64 %rd11929, %rd6545; mov.u64 %rd11930, %rd6545; mov.u64 %rd11931, %rd6544; mov.u64 %rd11932, %rd6545; mov.u64 %rd11933, %rd6545; mov.u64 %rd11934, %rd6544; $L__BB1_692: setp.eq.s64 %p1016, %rd11935, 0; @%p1016 bra $L__BB1_695; add.s64 %rd11935, %rd11935, -1; add.s64 %rd7137, %rd11932, 8; setp.eq.s64 %p1017, %rd11932, %rd11928; selp.b64 %rd11928, %rd7137, %rd11928, %p1017; add.s64 %rd7138, %rd11929, 8; selp.b64 %rd11929, %rd7138, %rd11929, %p1017; add.s64 %rd7139, %rd11930, 8; selp.b64 %rd11930, %rd7139, %rd11930, %p1017; add.s64 %rd7140, %rd11931, 8; selp.b64 %rd11931, %rd7140, %rd11931, %p1017; selp.b64 %rd7141, %rd7138, %rd11932, %p1017; selp.b64 %rd7142, 
%rd7139, %rd11933, %p1017; selp.b64 %rd7143, %rd7140, %rd11934, %p1017; setp.eq.s64 %p1018, %rd11935, 0; add.s64 %rd7144, %rd7141, 4; add.s64 %rd7145, %rd7142, 4; add.s64 %rd7146, %rd7143, 4; selp.b64 %rd11932, %rd7141, %rd7144, %p1018; selp.b64 %rd11933, %rd7142, %rd7145, %p1018; selp.b64 %rd11934, %rd7143, %rd7146, %p1018; ld.local.f32 %f2764, [%rd7142]; setp.eq.f32 %p1019, %f2764, 0f00000000; @%p1019 bra $L__BB1_692; add.f32 %f2765, %f502, %f508; mov.b32 %r2639, %f2765; add.f32 %f2766, %f503, %f509; mov.b32 %r2640, %f2766; cvt.u64.u32 %rd7149, %r2640; cvt.u64.u32 %rd7150, %r2639; bfi.b64 %rd11938, %rd7149, %rd7150, 32, 32; mov.u64 %rd11939, 0; bra.uni $L__BB1_708; $L__BB1_695: setp.lt.f32 %p1020, %f504, %f506; mov.f32 %f5368, 0fFF7FFFFF; @%p1020 bra $L__BB1_698; bra.uni $L__BB1_696; $L__BB1_698: setp.leu.f32 %p1025, %f506, 0fFF7FFFFF; mov.pred %p2924, 0; @%p1025 bra $L__BB1_700; mov.f32 %f5368, %f506; bra.uni $L__BB1_700; $L__BB1_696: setp.leu.f32 %p1022, %f504, 0fFF7FFFFF; mov.pred %p2924, 0; @%p1022 bra $L__BB1_700; mov.pred %p2924, -1; mov.f32 %f5368, %f504; $L__BB1_700: setp.lt.f32 %p1027, %f505, %f507; @%p1027 bra $L__BB1_703; bra.uni $L__BB1_701; $L__BB1_703: setp.gt.f32 %p1029, %f507, %f5368; @%p1029 bra $L__BB1_706; bra.uni $L__BB1_704; $L__BB1_706: cvta.to.local.u64 %rd7157, %rd6562; mov.u64 %rd7158, 0; st.local.u64 [%rd7157], %rd7158; neg.f32 %f5370, %f507; mov.u64 %rd11937, %rd1166; bra.uni $L__BB1_707; $L__BB1_701: setp.leu.f32 %p1028, %f505, %f5368; @%p1028 bra $L__BB1_704; mov.u64 %rd7153, 0; st.local.u64 [%rd6563], %rd7153; mov.u64 %rd11937, %rd1166; mov.f32 %f5368, %f505; bra.uni $L__BB1_705; $L__BB1_704: mov.u64 %rd7155, 0; st.local.u64 [%rd6563], %rd7155; neg.f32 %f5370, %f5368; not.pred %p1030, %p2924; mov.u64 %rd11937, %rd6563; @%p1030 bra $L__BB1_707; $L__BB1_705: mov.f32 %f5370, %f5368; $L__BB1_707: st.local.f32 [%rd11937], %f5370; ld.local.u64 %rd7163, [%rd6563]; cvt.u32.u64 %r2641, %rd7163; mov.b32 %f2769, %r2641; shr.u64 %rd7164, 
%rd7163, 32; cvt.u32.u64 %r2642, %rd7164; mov.b32 %f2770, %r2642; add.f32 %f2771, %f502, %f2769; add.f32 %f2772, %f503, %f2770; mov.b32 %r2643, %f2771; mov.b32 %r2644, %f2772; cvt.u64.u32 %rd7165, %r2644; cvt.u64.u32 %rd7166, %r2643; bfi.b64 %rd11938, %rd7165, %rd7166, 32, 32; mov.u64 %rd11939, 1; $L__BB1_708: mov.u64 %rd11104, 0; cvt.u32.u64 %r2645, %rd11938; mov.b32 %f2773, %r2645; shr.u64 %rd7167, %rd11938, 32; cvt.u32.u64 %r2646, %rd7167; mov.b32 %f2774, %r2646; mul.f32 %f2775, %f501, %f2773; mul.f32 %f2776, %f500, %f2774; sub.f32 %f2777, %f2775, %f2776; mul.f32 %f2778, %f501, %f2774; fma.rn.f32 %f2779, %f500, %f2773, %f2778; add.f32 %f2780, %f498, %f2777; mov.b32 %r2647, %f2780; add.f32 %f2781, %f499, %f2779; mov.b32 %r2648, %f2781; cvt.u64.u32 %rd7168, %r2648; cvt.u64.u32 %rd7169, %r2647; bfi.b64 %rd7170, %rd7168, %rd7169, 32, 32; or.b64 %rd7171, %rd11104, %rd7170; mov.b64 {%r4992, %r4993}, %rd7171; mov.b64 {%r4994, %r2649}, %rd11939; bra.uni $L__BB1_709; $L__BB1_673: sub.f32 %f2722, %f5365, %f459; abs.f32 %f491, %f2722; setp.le.f32 %p982, %f491, 0f34000000; @%p982 bra $L__BB1_675; abs.f32 %f2723, %f5365; abs.f32 %f2724, %f459; setp.gt.f32 %p984, %f2724, %f2723; selp.f32 %f2725, %f2724, %f2723, %p984; mul.f32 %f2726, %f2725, 0f34000000; setp.gtu.f32 %p985, %f491, %f2726; @%p985 bra $L__BB1_679; bra.uni $L__BB1_675; $L__BB1_656: setp.gt.u64 %p964, %rd1675, %rd1678; @%p964 bra $L__BB1_658; bra.uni $L__BB1_657; $L__BB1_658: add.s64 %rd7104, %rd1676, %rd1678; ld.u8 %rs447, [%rd7104]; setp.eq.s16 %p965, %rs447, 0; @%p965 bra $L__BB1_683; cvt.rn.f32.u64 %f2711, %rd1678; fma.rn.f32 %f475, %f469, %f2711, 0fBF000000; setp.gt.u64 %p966, %rd1672, %rd1678; @%p966 bra $L__BB1_661; bra.uni $L__BB1_660; $L__BB1_661: shl.b64 %rd7105, %rd1678, 2; add.s64 %rd1679, %rd1677, %rd7105; ld.f32 %f476, [%rd1679]; add.s64 %rd7106, %rd1678, 1; setp.gt.u64 %p967, %rd1672, %rd7106; @%p967 bra $L__BB1_663; bra.uni $L__BB1_662; $L__BB1_663: ld.f32 %f477, [%rd1679+4]; setp.gt.f32 %p968, 
%f477, %f471; setp.gt.f32 %p969, %f476, %f471; and.pred %p970, %p969, %p968; @%p970 bra $L__BB1_683; setp.lt.f32 %p971, %f476, %f470; setp.lt.f32 %p972, %f477, %f470; and.pred %p973, %p971, %p972; @%p973 bra $L__BB1_683; mul.f32 %f2712, %f2692, %f475; mov.b32 %r2597, %f2712; mul.f32 %f480, %f2693, %f476; mov.b32 %r2598, %f480; cvt.u64.u32 %rd7107, %r2598; cvt.u64.u32 %rd7108, %r2597; add.f32 %f2713, %f469, %f475; mul.f32 %f478, %f2692, %f2713; mov.b32 %r514, %f478; mul.f32 %f2714, %f2693, %f477; mov.b32 %r2599, %f2714; cvt.u64.u32 %rd7109, %r2599; cvt.u64.u32 %rd7110, %r514; bfi.b64 %rd7111, %rd7109, %rd7110, 32, 32; bfi.b64 %rd7112, %rd7107, %rd7108, 32, 32; cvt.u32.u64 %r4987, %rd7112; mov.b32 %f5365, %r4987; sub.f32 %f481, %f478, %f5365; sub.f32 %f482, %f2714, %f480; sub.f32 %f2715, %f459, %f5365; sub.f32 %f2716, %f460, %f480; mul.f32 %f2717, %f482, %f2716; fma.rn.f32 %f483, %f481, %f2715, %f2717; mul.f32 %f2718, %f482, %f482; fma.rn.f32 %f2719, %f481, %f481, %f2718; add.f32 %f484, %f2719, 0f00000000; setp.gtu.f32 %p974, %f483, 0f00000000; mov.b64 {%r2600, %r4988}, %rd7112; mov.b64 {%r2601, %r517}, %rd7111; @%p974 bra $L__BB1_667; bra.uni $L__BB1_666; $L__BB1_667: setp.ltu.f32 %p975, %f483, %f484; @%p975 bra $L__BB1_669; bra.uni $L__BB1_668; $L__BB1_669: setp.eq.f32 %p976, %f484, 0f00000000; @%p976 bra $L__BB1_682; div.rn.f32 %f2720, %f483, %f484; fma.rn.f32 %f5365, %f481, %f2720, %f5365; mov.b32 %r4987, %f5365; fma.rn.f32 %f5366, %f482, %f2720, %f480; mov.b32 %r4988, %f5366; bra.uni $L__BB1_671; $L__BB1_666: mov.b32 %f5366, %r4988; bra.uni $L__BB1_671; $L__BB1_668: mov.b32 %f5366, %r517; mov.f32 %f5365, %f478; mov.u32 %r4987, %r514; mov.u32 %r4988, %r517; $L__BB1_671: setp.eq.f32 %p977, %f459, %f5365; @%p977 bra $L__BB1_675; bra.uni $L__BB1_672; $L__BB1_675: setp.eq.f32 %p987, %f5366, %f460; mov.pred %p986, -1; mov.pred %p2922, %p986; @%p987 bra $L__BB1_679; setp.eq.f32 %p989, %f473, 0f7F800000; and.b32 %r2603, %r4988, 2147483647; mov.b32 %f2727, %r2603; 
setp.eq.f32 %p990, %f2727, 0f7F800000; or.pred %p991, %p989, %p990; mov.pred %p2922, 0; @%p991 bra $L__BB1_679; sub.f32 %f2728, %f5366, %f460; abs.f32 %f492, %f2728; setp.le.f32 %p993, %f492, 0f34000000; mov.pred %p2922, %p986; @%p993 bra $L__BB1_679; abs.f32 %f2729, %f5366; abs.f32 %f2730, %f460; setp.gt.f32 %p994, %f2730, %f2729; selp.f32 %f2731, %f2730, %f2729, %p994; mul.f32 %f2732, %f2731, 0f34000000; setp.le.f32 %p2922, %f492, %f2732; bra.uni $L__BB1_679; $L__BB1_672: setp.eq.f32 %p979, %f472, 0f7F800000; and.b32 %r2602, %r4987, 2147483647; mov.b32 %f2721, %r2602; setp.eq.f32 %p980, %f2721, 0f7F800000; or.pred %p981, %p979, %p980; mov.pred %p2922, 0; @%p981 bra $L__BB1_679; bra.uni $L__BB1_673; $L__BB1_679: cvt.u64.u32 %rd7113, %r4988; cvt.u64.u32 %rd7114, %r4987; bfi.b64 %rd1680, %rd7113, %rd7114, 32, 32; mov.b64 {%r2604, %r2605}, %rd1680; selp.u64 %rd1681, 1, 0, %p2922; mov.b32 %f2733, %r2604; sub.f32 %f2734, %f2733, %f459; mov.b32 %f2735, %r2605; sub.f32 %f2736, %f2735, %f460; mul.f32 %f2737, %f2736, %f2736; fma.rn.f32 %f2738, %f2734, %f2734, %f2737; add.f32 %f493, %f2738, 0f00000000; setp.geu.f32 %p995, %f493, %f5367; @%p995 bra $L__BB1_683; sqrt.rn.f32 %f2739, %f493; setp.gtu.f32 %p996, %f2739, %f6; mov.f32 %f5367, %f493; @%p996 bra $L__BB1_683; mov.b64 {%r4991, %r2606}, %rd1681; mov.u32 %r4989, %r2604; mov.u32 %r4990, %r2605; mov.f32 %f5367, %f493; $L__BB1_683: add.s64 %rd1678, %rd1678, 1; setp.lt.u64 %p997, %rd1678, %rd1674; @%p997 bra $L__BB1_656; st.local.u32 [%rd1671+8], %r4991; mov.b64 %rd7115, {%r4989, %r4990}; st.local.u64 [%rd1671], %rd7115; $L__BB1_685: cvt.u64.u32 %rd7116, %r506; cvt.u64.u32 %rd7117, %r507; bfi.b64 %rd1683, %rd7117, %rd7116, 32, 32; ld.local.v4.u32 {%r2610, %r2611, %r2612, %r2613}, [%rd1671]; mov.b64 %rd1685, {%r2612, %r2613}; mov.b64 %rd1684, {%r2610, %r2611}; mov.b32 {%rs448, %rs449}, %r2612; and.b16 %rs450, %rs448, 255; setp.eq.s16 %p998, %rs450, 2; cvt.u64.u16 %rd7118, %rs448; and.b64 %rd7119, %rd7118, 255; selp.b64 
%rd7120, 2, %rd7119, %p998; and.b64 %rd7121, %rd1685, 4294967040; or.b64 %rd7122, %rd7121, %rd7120; mov.b64 {%r2618, %r2619}, %rd7122; mov.b32 {%rs1024, %rs451}, %r2618; and.b16 %rs452, %rs1024, 255; setp.eq.s16 %p999, %rs452, 2; mov.u32 %r4994, 2; mov.u32 %r4992, 0; mov.u32 %r4993, %r4992; @%p999 bra $L__BB1_709; ld.global.u8 %rs453, [%rd1182+64]; setp.eq.s16 %p1000, %rs453, 0; shr.u64 %rd7123, %rd1684, 32; cvt.u32.u64 %r2620, %rd7123; mov.b32 %f495, %r2620; @%p1000 bra $L__BB1_690; mov.b64 {%r2621, %r2622}, %rd1683; mov.b32 %f497, %r2622; mov.b32 %f496, %r2621; mov.b64 {%r2623, %r2624}, %rd1669; mov.b64 {%r2625, %r2626}, %rd1670; ld.global.u8 %rs67, [%rd1182+65]; mov.b32 %f2740, %r2625; setp.gt.f32 %p1002, %f496, %f2740; mov.b32 %f2741, %r2623; setp.lt.f32 %p1003, %f496, %f2741; or.pred %p1004, %p1003, %p1002; mov.pred %p2923, 0; @%p1004 bra $L__BB1_689; setp.geu.f32 %p1005, %f497, 0fFF7FFFFF; setp.leu.f32 %p1006, %f497, 0f7F7FFFFF; and.pred %p2923, %p1006, %p1005; $L__BB1_689: setp.ge.f32 %p1007, %f460, %f495; setp.le.f32 %p1008, %f460, %f495; setp.eq.s16 %p1009, %rs67, 0; selp.u32 %r2627, -1, 0, %p1007; selp.u32 %r2628, -1, 0, %p1008; selp.b32 %r2629, %r2628, %r2627, %p1009; and.b32 %r2630, %r2629, 1; setp.eq.b32 %p1010, %r2630, 1; and.pred %p1011, %p1010, %p2923; selp.u16 %rs1024, 1, 0, %p1011; $L__BB1_690: cvt.u32.u64 %r2631, %rd1684; mov.b32 %f2742, %r2631; mul.f32 %f2743, %f458, %f2742; ld.global.f32 %f2744, [%rd1182+252]; mul.f32 %f2745, %f2744, %f495; sub.f32 %f2746, %f2743, %f2745; mul.f32 %f2747, %f2744, %f2742; fma.rn.f32 %f2748, %f458, %f495, %f2747; add.f32 %f2749, %f456, %f2746; mov.b32 %r2632, %f2749; add.f32 %f2750, %f457, %f2748; mov.b32 %r2633, %f2750; cvt.u64.u32 %rd7124, %r2633; cvt.u64.u32 %rd7125, %r2632; cvt.u64.u16 %rd7126, %rs1024; bfi.b64 %rd7127, %rd7124, %rd7125, 32, 32; and.b64 %rd7128, %rd7126, 255; mov.b64 {%r4992, %r4993}, %rd7127; mov.b64 {%r4994, %r2634}, %rd7128; bra.uni $L__BB1_709; $L__BB1_508: ld.local.u32 %r2374, 
[%rd1186+24]; setp.eq.s32 %p767, %r2374, 0; @%p767 bra $L__BB1_521; setp.ne.s32 %p768, %r2374, 1; @%p768 bra $L__BB1_534; add.s64 %rd1206, %rd11762, 1; or.b64 %rd6604, %rd1206, %rd1187; and.b64 %rd6605, %rd6604, -4294967296; setp.eq.s64 %p769, %rd6605, 0; @%p769 bra $L__BB1_512; rem.u64 %rd11766, %rd1206, %rd1187; bra.uni $L__BB1_513; $L__BB1_521: setp.eq.s64 %p776, %rd11762, 0; selp.b64 %rd1253, %rd1187, %rd11762, %p776; add.s64 %rd6644, %rd1253, -1; setp.gt.u64 %p777, %rd1187, %rd6644; @%p777 bra $L__BB1_523; bra.uni $L__BB1_522; $L__BB1_523: shl.b64 %rd6645, %rd1253, 3; add.s64 %rd6646, %rd1188, %rd6645; ld.u32 %rd6647, [%rd6646+-8]; ld.u32 %rd6648, [%rd6646+-4]; bfi.b64 %rd1254, %rd6648, %rd6647, 32, 32; or.b64 %rd6649, %rd1253, %rd1187; and.b64 %rd6650, %rd6649, -4294967296; setp.eq.s64 %p778, %rd6650, 0; @%p778 bra $L__BB1_525; rem.u64 %rd11783, %rd1253, %rd1187; bra.uni $L__BB1_526; $L__BB1_639: ld.u32 %r2560, [%rd1319+76]; cvt.u64.u32 %rd7032, %r2560; setp.le.u64 %p944, %rd1310, %rd7032; mul.wide.u32 %rd7033, %r2560, 12; add.s64 %rd7034, %rd1311, %rd7033; setp.eq.s64 %p945, %rd7034, 0; or.pred %p946, %p944, %p945; selp.b32 %r400, %r400, %r4948, %p946; selp.b32 %r399, %r399, %r4947, %p946; selp.b32 %r398, %r398, %r4946, %p946; selp.b32 %r402, %r402, %r4961, %p946; selp.b32 %r403, %r403, %r452, %p946; $L__BB1_544: mov.b32 %f391, %r403; $L__BB1_545: mov.u32 %r404, %r405; setp.eq.s32 %p788, %r404, 0; @%p788 bra $L__BB1_646; cvt.u64.u32 %rd6713, %r404; add.s64 %rd6714, %rd6713, -1; cvt.u32.u64 %r405, %rd6714; st.local.u32 [%rd1308+512], %r405; mul.wide.u32 %rd6715, %r404, 8; add.s64 %rd6716, %rd1308, %rd6715; ld.local.u32 %rd1317, [%rd6716+-4]; ld.local.u32 %rd6717, [%rd6716+-8]; shl.b64 %rd6718, %rd6717, 32; or.b64 %rd1316, %rd6718, 1; mov.b64 {%r2416, %r2417}, %rd1317; mov.b32 %f2487, %r2416; neg.f32 %f2488, %f2487; setp.le.f32 %p789, %f391, %f2488; @%p789 bra $L__BB1_545; mov.b64 {%r2418, %r2419}, %rd1316; cvt.u64.u32 %rd1318, %r2419; setp.gt.u64 %p790, 
%rd1307, %rd1318; @%p790 bra $L__BB1_549; bra.uni $L__BB1_548; $L__BB1_549: mul.lo.s64 %rd6719, %rd1318, 96; add.s64 %rd1319, %rd1309, %rd6719; ld.u8 %rs388, [%rd1319+88]; and.b16 %rs389, %rs388, 1; setp.eq.b16 %p792, %rs389, 1; mov.pred %p2921, 0; xor.pred %p793, %p792, %p2921; not.pred %p794, %p793; @%p794 bra $L__BB1_551; ld.v4.u32 {%r2420, %r2421, %r2422, %r2423}, [%rd1319+64]; cvt.u64.u32 %rd6720, %r2420; setp.gt.u64 %p796, %rd1310, %rd6720; mul.wide.u32 %rd6721, %r2420, 12; add.s64 %rd6722, %rd1311, %rd6721; selp.b64 %rd6723, %rd6722, 0, %p796; setp.eq.s64 %p797, %rd6723, 0; add.s64 %rd6724, %rd6723, 8; selp.b64 %rd11804, 0, %rd6724, %p797; cvt.u64.u32 %rd6725, %r2421; setp.gt.u64 %p798, %rd1310, %rd6725; mul.wide.u32 %rd6726, %r2421, 12; add.s64 %rd6727, %rd1311, %rd6726; selp.b64 %rd6728, %rd6727, 0, %p798; setp.eq.s64 %p799, %rd6728, 0; add.s64 %rd6729, %rd6728, 8; selp.b64 %rd11803, 0, %rd6729, %p799; ld.u32 %r2427, [%rd1319+72]; cvt.u64.u32 %rd6730, %r2427; setp.gt.u64 %p800, %rd1310, %rd6730; mul.wide.u32 %rd6731, %r2427, 12; add.s64 %rd6732, %rd1311, %rd6731; selp.b64 %rd6733, %rd6732, 0, %p800; setp.eq.s64 %p801, %rd6733, 0; add.s64 %rd6734, %rd6733, 8; selp.b64 %rd11802, 0, %rd6734, %p801; cvt.u64.u32 %rd6735, %r2423; setp.gt.u64 %p802, %rd1310, %rd6735; mul.wide.u32 %rd6736, %r2423, 12; add.s64 %rd6737, %rd1311, %rd6736; selp.b64 %rd6738, %rd6737, 0, %p802; setp.eq.s64 %p803, %rd6738, 0; add.s64 %rd6739, %rd6738, 8; selp.b64 %rd11801, 0, %rd6739, %p803; mov.pred %p2921, -1; $L__BB1_551: ld.v4.f32 {%f2489, %f2490, %f2491, %f2492}, [%rd1319]; sub.f32 %f2497, %f2489, %f389; sub.f32 %f2498, %f2490, %f389; sub.f32 %f2499, %f2491, %f389; sub.f32 %f2500, %f2492, %f389; ld.v4.f32 {%f2501, %f2502, %f2503, %f2504}, [%rd1319+16]; sub.f32 %f2509, %f2501, %f390; sub.f32 %f2510, %f2502, %f390; sub.f32 %f2511, %f2503, %f390; sub.f32 %f2512, %f2504, %f390; ld.v4.f32 {%f2513, %f2514, %f2515, %f2516}, [%rd1319+32]; sub.f32 %f2521, %f389, %f2513; sub.f32 %f2522, 
%f389, %f2514; sub.f32 %f2523, %f389, %f2515; sub.f32 %f2524, %f389, %f2516; ld.v4.f32 {%f2525, %f2526, %f2527, %f2528}, [%rd1319+48]; sub.f32 %f2533, %f390, %f2525; sub.f32 %f2534, %f390, %f2526; sub.f32 %f2535, %f390, %f2527; sub.f32 %f2536, %f390, %f2528; setp.ge.f32 %p804, %f2497, %f2521; selp.f32 %f2537, %f2497, %f2521, %p804; setp.ge.f32 %p805, %f2498, %f2522; selp.f32 %f2538, %f2498, %f2522, %p805; setp.ge.f32 %p806, %f2499, %f2523; selp.f32 %f2539, %f2499, %f2523, %p806; setp.ge.f32 %p807, %f2500, %f2524; selp.f32 %f2540, %f2500, %f2524, %p807; setp.ge.f32 %p808, %f2509, %f2533; selp.f32 %f2541, %f2509, %f2533, %p808; setp.ge.f32 %p809, %f2510, %f2534; selp.f32 %f2542, %f2510, %f2534, %p809; setp.ge.f32 %p810, %f2511, %f2535; selp.f32 %f2543, %f2511, %f2535, %p810; setp.ge.f32 %p811, %f2512, %f2536; selp.f32 %f2544, %f2512, %f2536, %p811; setp.ge.f32 %p812, %f2537, 0f00000000; selp.f32 %f2545, %f2537, 0f00000000, %p812; setp.ge.f32 %p813, %f2538, 0f00000000; selp.f32 %f2546, %f2538, 0f00000000, %p813; setp.ge.f32 %p814, %f2539, 0f00000000; selp.f32 %f2547, %f2539, 0f00000000, %p814; setp.ge.f32 %p815, %f2540, 0f00000000; selp.f32 %f2548, %f2540, 0f00000000, %p815; mov.b32 %r2428, %f2545; mov.b32 %r2429, %f2546; mov.b32 %r2430, %f2547; mov.b32 %r2431, %f2548; cvt.u64.u32 %rd6740, %r2431; cvt.u64.u32 %rd6741, %r2429; cvt.u64.u32 %rd6742, %r2428; cvt.u64.u32 %rd6743, %r2430; bfi.b64 %rd6744, %rd6740, %rd6743, 32, 32; bfi.b64 %rd6745, %rd6741, %rd6742, 32, 32; setp.ge.f32 %p816, %f2541, 0f00000000; selp.f32 %f2549, %f2541, 0f00000000, %p816; setp.ge.f32 %p817, %f2542, 0f00000000; selp.f32 %f2550, %f2542, 0f00000000, %p817; setp.ge.f32 %p818, %f2543, 0f00000000; selp.f32 %f2551, %f2543, 0f00000000, %p818; setp.ge.f32 %p819, %f2544, 0f00000000; selp.f32 %f2552, %f2544, 0f00000000, %p819; mov.b32 %r2432, %f2549; mov.b32 %r2433, %f2550; mov.b32 %r2434, %f2551; mov.b32 %r2435, %f2552; cvt.u64.u32 %rd6746, %r2435; cvt.u64.u32 %rd6747, %r2433; cvt.u64.u32 %rd6748, 
%r2432; cvt.u64.u32 %rd6749, %r2434; bfi.b64 %rd6750, %rd6746, %rd6749, 32, 32; bfi.b64 %rd6751, %rd6747, %rd6748, 32, 32; mov.b64 {%r2436, %r2437}, %rd6745; mov.b64 {%r2438, %r2439}, %rd6744; cvt.u64.u32 %rd6752, %r2439; cvt.u64.u32 %rd6753, %r2437; cvt.u64.u32 %rd6754, %r2438; bfi.b64 %rd6755, %rd6752, %rd6754, 32, 32; mov.b64 {%r2440, %r2441}, %rd6755; bfi.b64 %rd6756, %rd6753, %rd6742, 32, 32; mov.b64 {%r2442, %r2443}, %rd6756; mov.b32 %f2553, %r2442; mov.b32 %f2554, %r2443; mov.b32 %f2555, %r2440; mov.b32 %f2556, %r2441; mov.b32 %f2557, %r2436; mov.b32 %f2558, %r2437; mov.b32 %f2559, %r2438; mov.b32 %f2560, %r2439; mov.b64 {%r2444, %r2445}, %rd6751; mov.b64 {%r2446, %r2447}, %rd6750; cvt.u64.u32 %rd6757, %r2447; cvt.u64.u32 %rd6758, %r2445; cvt.u64.u32 %rd6759, %r2446; bfi.b64 %rd6760, %rd6757, %rd6759, 32, 32; mov.b64 {%r2448, %r2449}, %rd6760; bfi.b64 %rd6761, %rd6758, %rd6748, 32, 32; mov.b64 {%r2450, %r2451}, %rd6761; mov.b32 %f2561, %r2450; mov.b32 %f2562, %r2451; mov.b32 %f2563, %r2448; mov.b32 %f2564, %r2449; mov.b32 %f2565, %r2444; mov.b32 %f2566, %r2445; mov.b32 %f2567, %r2446; mov.b32 %f2568, %r2447; mul.f32 %f2569, %f2565, %f2561; mul.f32 %f2570, %f2566, %f2562; mul.f32 %f2571, %f2567, %f2563; mul.f32 %f2572, %f2568, %f2564; fma.rn.f32 %f2573, %f2557, %f2553, %f2569; fma.rn.f32 %f2574, %f2558, %f2554, %f2570; fma.rn.f32 %f2575, %f2559, %f2555, %f2571; fma.rn.f32 %f2576, %f2560, %f2556, %f2572; add.f32 %f2577, %f2573, 0f00000000; add.f32 %f2578, %f2574, 0f00000000; add.f32 %f2579, %f2575, 0f00000000; add.f32 %f2580, %f2576, 0f00000000; sqrt.rn.f32 %f2581, %f2577; sqrt.rn.f32 %f2582, %f2578; sqrt.rn.f32 %f2583, %f2579; sqrt.rn.f32 %f2584, %f2580; mov.b32 %r2452, %f2581; mov.b32 %r2453, %f2582; mov.b32 %r2454, %f2583; mov.b32 %r2455, %f2584; cvt.u64.u32 %rd6762, %r2455; cvt.u64.u32 %rd6763, %r2453; cvt.u64.u32 %rd6764, %r2452; cvt.u64.u32 %rd6765, %r2454; bfi.b64 %rd11910, %rd6762, %rd6765, 32, 32; mov.b64 {%r2456, %r2457}, %rd11910; bfi.b64 %rd11909, 
%rd6763, %rd6764, 32, 32; mov.b64 {%r2458, %r2459}, %rd11909; mov.b32 %f2585, %r2458; mov.b32 %f2586, %r2459; mov.b32 %f2587, %r2456; mov.b32 %f2588, %r2457; setp.lt.f32 %p820, %f2585, %f391; setp.lt.f32 %p821, %f2586, %f391; setp.lt.f32 %p822, %f2587, %f391; setp.lt.f32 %p823, %f2588, %f391; selp.u32 %r2460, 1, 0, %p820; selp.u32 %r2461, -1, 0, %p821; bfi.b32 %r2462, %r2461, %r2460, 8, 1; selp.u32 %r2463, -1, 0, %p822; bfi.b32 %r2464, %r2463, %r2462, 16, 1; selp.u32 %r2465, -1, 0, %p823; bfi.b32 %r2466, %r2465, %r2464, 24, 1; cvt.u64.u32 %rd6766, %r2466; mov.b64 {%r2467, %r2468}, %rd6766; mov.b32 {%rs390, %rs391}, %r2467; and.b16 %rs392, %rs390, 1; shr.u16 %rs393, %rs390, 7; and.b16 %rs394, %rs393, 2; or.b16 %rs395, %rs394, %rs392; shl.b16 %rs396, %rs391, 2; and.b16 %rs397, %rs396, 4; or.b16 %rs398, %rs395, %rs397; shr.u16 %rs399, %rs391, 5; and.b16 %rs400, %rs399, 8; or.b16 %rs401, %rs398, %rs400; cvt.u64.u16 %rd1330, %rs401; @%p2921 bra $L__BB1_553; bra.uni $L__BB1_552; $L__BB1_553: mov.u64 %rd6767, 1; st.local.v2.u64 [%rd8], {%rd11804, %rd11803}; st.local.v2.u64 [%rd8+16], {%rd11802, %rd11801}; mov.f32 %f2589, 0f00000000; st.local.v4.f32 [%rd24], {%f2589, %f2589, %f2589, %f2589}; mov.u32 %r2479, 4; st.local.u32 [%rd7+16], %r2479; st.local.u32 [%rd7+52], %r2479; st.local.u32 [%rd7+88], %r2479; st.local.u32 [%rd7+124], %r2479; mov.u64 %rd1334, %rd6767; $L__BB1_554: add.s64 %rd6771, %rd1334, -1; cvt.u32.u64 %r2480, %rd6771; shl.b64 %rd6773, %rd6767, %r2480; and.b64 %rd6774, %rd6773, %rd1330; setp.eq.s64 %p824, %rd6774, 0; @%p824 bra $L__BB1_607; shl.b64 %rd6775, %rd1334, 3; add.s64 %rd6776, %rd8, %rd6775; ld.local.u64 %rd1335, [%rd6776+-8]; setp.eq.s64 %p825, %rd1335, 0; @%p825 bra $L__BB1_607; ld.u32 %r406, [%rd1335]; cvt.u64.u32 %rd1336, %r406; ld.global.u64 %rd6777, [%rd1182+112]; setp.gt.u64 %p826, %rd6777, %rd1336; @%p826 bra $L__BB1_558; bra.uni $L__BB1_557; $L__BB1_558: ld.global.u64 %rd6778, [%rd1182+104]; mul.lo.s64 %rd6779, %rd1336, 12; add.s64 %rd1337, 
%rd6778, %rd6779; ld.u32 %rd1338, [%rd1337+8]; ld.u32 %rd1339, [%rd1337]; ld.global.u64 %rd1340, [%rd1182+96]; setp.gt.u64 %p827, %rd1340, %rd1339; @%p827 bra $L__BB1_560; bra.uni $L__BB1_559; $L__BB1_560: ld.global.u64 %rd1341, [%rd1182+88]; shl.b64 %rd6780, %rd1339, 3; add.s64 %rd6781, %rd1341, %rd6780; ld.u32 %rd6782, [%rd6781]; ld.u32 %rd6783, [%rd6781+4]; bfi.b64 %rd1342, %rd6783, %rd6782, 32, 32; ld.u32 %rd1343, [%rd1337+4]; setp.gt.u64 %p828, %rd1340, %rd1343; @%p828 bra $L__BB1_562; bra.uni $L__BB1_561; $L__BB1_562: setp.gt.u64 %p829, %rd1340, %rd1338; @%p829 bra $L__BB1_564; bra.uni $L__BB1_563; $L__BB1_564: shl.b64 %rd6784, %rd1343, 3; add.s64 %rd6785, %rd1341, %rd6784; shl.b64 %rd6786, %rd1338, 3; add.s64 %rd6787, %rd1341, %rd6786; cvt.u32.u64 %r2481, %rd1342; mov.b32 %f392, %r2481; shr.u64 %rd6788, %rd1342, 32; cvt.u32.u64 %r2482, %rd6788; mov.b32 %f393, %r2482; ld.u32 %rd6789, [%rd6785]; ld.u32 %rd6790, [%rd6785+4]; bfi.b64 %rd1344, %rd6790, %rd6789, 32, 32; cvt.u32.u64 %r2483, %rd1344; shr.u64 %rd6791, %rd1344, 32; cvt.u32.u64 %r2484, %rd6791; mov.b32 %f394, %r2483; sub.f32 %f395, %f394, %f392; mov.b32 %f5359, %r2484; sub.f32 %f397, %f5359, %f393; ld.u32 %rd6792, [%rd6787]; ld.u32 %rd6793, [%rd6787+4]; bfi.b64 %rd1345, %rd6793, %rd6792, 32, 32; cvt.u32.u64 %r2485, %rd1345; shr.u64 %rd6794, %rd1345, 32; cvt.u32.u64 %r2486, %rd6794; mov.b32 %f398, %r2485; sub.f32 %f399, %f398, %f392; mov.b32 %f400, %r2486; sub.f32 %f401, %f400, %f393; sub.f32 %f402, %f389, %f392; sub.f32 %f403, %f390, %f393; mul.f32 %f2590, %f397, %f403; fma.rn.f32 %f404, %f395, %f402, %f2590; mul.f32 %f2591, %f401, %f403; fma.rn.f32 %f405, %f399, %f402, %f2591; setp.le.f32 %p830, %f404, 0f00000000; setp.le.f32 %p831, %f405, 0f00000000; and.pred %p832, %p830, %p831; @%p832 bra $L__BB1_602; bra.uni $L__BB1_565; $L__BB1_602: add.u64 %rd11895, %SP, 552; cvta.to.local.u64 %rd11893, %rd11895; add.u64 %rd11901, %SP, 0; cvta.to.local.u64 %rd11899, %rd11901; st.local.u64 [%rd11899], %rd1342; 
mov.u64 %rd11906, 2; mov.u64 %rd11892, %rd1164; mov.u64 %rd11894, %rd11893; mov.u64 %rd11896, %rd11893; mov.u64 %rd11897, %rd11893; mov.u64 %rd11898, %rd11895; mov.u64 %rd11900, %rd11899; mov.u64 %rd11902, %rd11899; mov.u64 %rd11903, %rd11899; mov.u64 %rd11904, %rd11901; mov.u64 %rd11905, %rd1158; $L__BB1_603: setp.eq.s64 %p885, %rd11906, 0; mov.u64 %rd11907, 1; @%p885 bra $L__BB1_605; add.s64 %rd11906, %rd11906, -1; add.s64 %rd6939, %rd11893, 8; setp.eq.s64 %p886, %rd11896, %rd11892; selp.b64 %rd6940, %rd6939, %rd11896, %p886; add.s64 %rd6941, %rd11894, 8; selp.b64 %rd6942, %rd6941, %rd11897, %p886; add.s64 %rd6943, %rd11895, 8; selp.b64 %rd6944, %rd6943, %rd11898, %p886; mov.u64 %rd11907, 0; setp.eq.s64 %p887, %rd11906, 0; add.s64 %rd6945, %rd6940, 4; add.s64 %rd6946, %rd6942, 4; add.s64 %rd6947, %rd6944, 4; selp.b64 %rd1571, %rd6940, %rd6945, %p887; selp.b64 %rd11897, %rd6942, %rd6946, %p887; selp.b64 %rd11898, %rd6944, %rd6947, %p887; selp.b64 %rd11893, %rd6939, %rd11893, %p886; selp.b64 %rd11894, %rd6941, %rd11894, %p886; selp.b64 %rd11895, %rd6943, %rd11895, %p886; add.s64 %rd6948, %rd11896, 8; selp.b64 %rd11892, %rd6948, %rd11892, %p886; add.s64 %rd6949, %rd11902, 8; setp.eq.s64 %p888, %rd11899, %rd11905; selp.b64 %rd6950, %rd6949, %rd11899, %p888; add.s64 %rd6951, %rd11903, 8; selp.b64 %rd6952, %rd6951, %rd11900, %p888; add.s64 %rd6953, %rd11904, 8; selp.b64 %rd6954, %rd6953, %rd11901, %p888; selp.b64 %rd11902, %rd6949, %rd11902, %p888; selp.b64 %rd11903, %rd6951, %rd11903, %p888; selp.b64 %rd11904, %rd6953, %rd11904, %p888; add.s64 %rd6955, %rd11899, 8; selp.b64 %rd11905, %rd6955, %rd11905, %p888; add.s64 %rd6956, %rd6950, 4; add.s64 %rd6957, %rd6952, 4; add.s64 %rd6958, %rd6954, 4; selp.b64 %rd11899, %rd6950, %rd6956, %p887; selp.b64 %rd11900, %rd6952, %rd6957, %p887; selp.b64 %rd11901, %rd6954, %rd6958, %p887; ld.local.f32 %f2657, [%rd6952]; ld.local.f32 %f2658, [%rd6942]; setp.eq.f32 %p889, %f2658, %f2657; mov.u64 %rd11896, %rd1571; @%p889 bra 
$L__BB1_603; $L__BB1_605: mov.u64 %rd11081, 0; or.b64 %rd6960, %rd11081, %rd1342; mov.b64 {%r2528, %r2529}, %rd6960; mov.b64 {%r2530, %r2531}, %rd11907; cvt.u32.u64 %r2533, %rd11081; or.b32 %r4943, %r2533, %r2481; mov.u32 %r4944, 0; mov.b32 %f5363, %r2529; mov.b32 {%rs1023, %rs420}, %r2530; mov.u32 %r4945, %r4944; bra.uni $L__BB1_606; $L__BB1_565: sub.f32 %f406, %f389, %f394; sub.f32 %f407, %f390, %f5359; mul.f32 %f2592, %f397, %f407; fma.rn.f32 %f408, %f395, %f406, %f2592; mul.f32 %f2593, %f401, %f407; fma.rn.f32 %f409, %f399, %f406, %f2593; setp.ge.f32 %p833, %f408, 0f00000000; setp.le.f32 %p834, %f409, %f408; and.pred %p835, %p834, %p833; @%p835 bra $L__BB1_598; bra.uni $L__BB1_566; $L__BB1_598: add.u64 %rd11879, %SP, 552; cvta.to.local.u64 %rd11877, %rd11879; add.u64 %rd11885, %SP, 0; cvta.to.local.u64 %rd11883, %rd11885; st.local.u64 [%rd11883], %rd1344; mov.u64 %rd11890, 2; mov.u64 %rd11876, %rd1164; mov.u64 %rd11878, %rd11877; mov.u64 %rd11880, %rd11877; mov.u64 %rd11881, %rd11877; mov.u64 %rd11882, %rd11879; mov.u64 %rd11884, %rd11883; mov.u64 %rd11886, %rd11883; mov.u64 %rd11887, %rd11883; mov.u64 %rd11888, %rd11885; mov.u64 %rd11889, %rd1159; $L__BB1_599: setp.eq.s64 %p880, %rd11890, 0; mov.u64 %rd11891, 1; @%p880 bra $L__BB1_601; add.s64 %rd11890, %rd11890, -1; add.s64 %rd6912, %rd11877, 8; setp.eq.s64 %p881, %rd11880, %rd11876; selp.b64 %rd6913, %rd6912, %rd11880, %p881; add.s64 %rd6914, %rd11878, 8; selp.b64 %rd6915, %rd6914, %rd11881, %p881; add.s64 %rd6916, %rd11879, 8; selp.b64 %rd6917, %rd6916, %rd11882, %p881; mov.u64 %rd11891, 0; setp.eq.s64 %p882, %rd11890, 0; add.s64 %rd6918, %rd6913, 4; add.s64 %rd6919, %rd6915, 4; add.s64 %rd6920, %rd6917, 4; selp.b64 %rd1533, %rd6913, %rd6918, %p882; selp.b64 %rd11881, %rd6915, %rd6919, %p882; selp.b64 %rd11882, %rd6917, %rd6920, %p882; selp.b64 %rd11877, %rd6912, %rd11877, %p881; selp.b64 %rd11878, %rd6914, %rd11878, %p881; selp.b64 %rd11879, %rd6916, %rd11879, %p881; add.s64 %rd6921, %rd11880, 8; selp.b64 
%rd11876, %rd6921, %rd11876, %p881; add.s64 %rd6922, %rd11886, 8; setp.eq.s64 %p883, %rd11883, %rd11889; selp.b64 %rd6923, %rd6922, %rd11883, %p883; add.s64 %rd6924, %rd11887, 8; selp.b64 %rd6925, %rd6924, %rd11884, %p883; add.s64 %rd6926, %rd11888, 8; selp.b64 %rd6927, %rd6926, %rd11885, %p883; selp.b64 %rd11886, %rd6922, %rd11886, %p883; selp.b64 %rd11887, %rd6924, %rd11887, %p883; selp.b64 %rd11888, %rd6926, %rd11888, %p883; add.s64 %rd6928, %rd11883, 8; selp.b64 %rd11889, %rd6928, %rd11889, %p883; add.s64 %rd6929, %rd6923, 4; add.s64 %rd6930, %rd6925, 4; add.s64 %rd6931, %rd6927, 4; selp.b64 %rd11883, %rd6923, %rd6929, %p882; selp.b64 %rd11884, %rd6925, %rd6930, %p882; selp.b64 %rd11885, %rd6927, %rd6931, %p882; ld.local.f32 %f2655, [%rd6925]; ld.local.f32 %f2656, [%rd6915]; setp.eq.f32 %p884, %f2656, %f2655; mov.u64 %rd11880, %rd1533; @%p884 bra $L__BB1_599; $L__BB1_601: mov.u64 %rd11080, 0; or.b64 %rd6933, %rd11080, %rd1344; mov.b64 {%r2520, %r2521}, %rd6933; mov.b64 {%r2522, %r2523}, %rd11891; cvt.u32.u64 %r2525, %rd11080; or.b32 %r4943, %r2525, %r2483; mov.u32 %r4944, 0; mov.b32 %f5363, %r2521; mov.u32 %r4945, 1; mov.b32 {%rs1023, %rs416}, %r2522; bra.uni $L__BB1_606; $L__BB1_566: sub.f32 %f410, %f389, %f398; sub.f32 %f411, %f390, %f400; mul.f32 %f2594, %f397, %f411; fma.rn.f32 %f412, %f395, %f410, %f2594; mul.f32 %f2595, %f401, %f411; fma.rn.f32 %f413, %f399, %f410, %f2595; setp.ge.f32 %p836, %f413, 0f00000000; setp.le.f32 %p837, %f412, %f413; and.pred %p838, %p837, %p836; @%p838 bra $L__BB1_594; bra.uni $L__BB1_567; $L__BB1_594: add.u64 %rd11863, %SP, 552; cvta.to.local.u64 %rd11861, %rd11863; add.u64 %rd11869, %SP, 0; cvta.to.local.u64 %rd11867, %rd11869; st.local.u64 [%rd11867], %rd1345; mov.u64 %rd11874, 2; mov.u64 %rd11860, %rd1164; mov.u64 %rd11862, %rd11861; mov.u64 %rd11864, %rd11861; mov.u64 %rd11865, %rd11861; mov.u64 %rd11866, %rd11863; mov.u64 %rd11868, %rd11867; mov.u64 %rd11870, %rd11867; mov.u64 %rd11871, %rd11867; mov.u64 %rd11872, 
%rd11869; mov.u64 %rd11873, %rd1160; $L__BB1_595: setp.eq.s64 %p875, %rd11874, 0; mov.u64 %rd11875, 1; @%p875 bra $L__BB1_597; add.s64 %rd11874, %rd11874, -1; add.s64 %rd6885, %rd11861, 8; setp.eq.s64 %p876, %rd11864, %rd11860; selp.b64 %rd6886, %rd6885, %rd11864, %p876; add.s64 %rd6887, %rd11862, 8; selp.b64 %rd6888, %rd6887, %rd11865, %p876; add.s64 %rd6889, %rd11863, 8; selp.b64 %rd6890, %rd6889, %rd11866, %p876; mov.u64 %rd11875, 0; setp.eq.s64 %p877, %rd11874, 0; add.s64 %rd6891, %rd6886, 4; add.s64 %rd6892, %rd6888, 4; add.s64 %rd6893, %rd6890, 4; selp.b64 %rd1495, %rd6886, %rd6891, %p877; selp.b64 %rd11865, %rd6888, %rd6892, %p877; selp.b64 %rd11866, %rd6890, %rd6893, %p877; selp.b64 %rd11861, %rd6885, %rd11861, %p876; selp.b64 %rd11862, %rd6887, %rd11862, %p876; selp.b64 %rd11863, %rd6889, %rd11863, %p876; add.s64 %rd6894, %rd11864, 8; selp.b64 %rd11860, %rd6894, %rd11860, %p876; add.s64 %rd6895, %rd11870, 8; setp.eq.s64 %p878, %rd11867, %rd11873; selp.b64 %rd6896, %rd6895, %rd11867, %p878; add.s64 %rd6897, %rd11871, 8; selp.b64 %rd6898, %rd6897, %rd11868, %p878; add.s64 %rd6899, %rd11872, 8; selp.b64 %rd6900, %rd6899, %rd11869, %p878; selp.b64 %rd11870, %rd6895, %rd11870, %p878; selp.b64 %rd11871, %rd6897, %rd11871, %p878; selp.b64 %rd11872, %rd6899, %rd11872, %p878; add.s64 %rd6901, %rd11867, 8; selp.b64 %rd11873, %rd6901, %rd11873, %p878; add.s64 %rd6902, %rd6896, 4; add.s64 %rd6903, %rd6898, 4; add.s64 %rd6904, %rd6900, 4; selp.b64 %rd11867, %rd6896, %rd6902, %p877; selp.b64 %rd11868, %rd6898, %rd6903, %p877; selp.b64 %rd11869, %rd6900, %rd6904, %p877; ld.local.f32 %f2653, [%rd6898]; ld.local.f32 %f2654, [%rd6888]; setp.eq.f32 %p879, %f2654, %f2653; mov.u64 %rd11864, %rd1495; @%p879 bra $L__BB1_595; $L__BB1_597: mov.u64 %rd11079, 0; or.b64 %rd6906, %rd11079, %rd1345; mov.b64 {%r2512, %r2513}, %rd6906; mov.b64 {%r2514, %r2515}, %rd11875; cvt.u32.u64 %r2517, %rd11079; or.b32 %r4943, %r2517, %r2485; mov.u32 %r4944, 0; mov.b32 %f5363, %r2513; mov.b32 
{%rs1023, %rs412}, %r2514; mov.u32 %r4945, 2; bra.uni $L__BB1_606; $L__BB1_567: sub.f32 %f414, %f398, %f394; sub.f32 %f415, %f400, %f5359; mul.f32 %f2596, %f397, %f399; mul.f32 %f2597, %f395, %f401; sub.f32 %f416, %f2597, %f2596; mul.f32 %f2598, %f397, %f402; mul.f32 %f2599, %f395, %f403; sub.f32 %f2600, %f2599, %f2598; mul.f32 %f2601, %f416, %f2600; setp.lt.f32 %p839, %f2601, 0f00000000; setp.ge.f32 %p840, %f404, 0f00000000; and.pred %p841, %p840, %p839; setp.le.f32 %p842, %f408, 0f00000000; and.pred %p843, %p842, %p841; mov.u16 %rs1022, 0; @%p843 bra $L__BB1_570; mul.f32 %f2602, %f399, %f411; mul.f32 %f2603, %f410, %f401; sub.f32 %f2604, %f2602, %f2603; mul.f32 %f2605, %f416, %f2604; setp.gt.f32 %p844, %f2605, 0f80000000; setp.ge.f32 %p845, %f405, 0f00000000; and.pred %p846, %p845, %p844; setp.le.f32 %p847, %f413, 0f00000000; and.pred %p848, %p847, %p846; mov.u16 %rs1022, 1; @%p848 bra $L__BB1_570; mul.f32 %f2606, %f414, %f407; mul.f32 %f2607, %f406, %f415; sub.f32 %f2608, %f2606, %f2607; mul.f32 %f2609, %f416, %f2608; setp.lt.f32 %p849, %f2609, 0f00000000; sub.f32 %f2610, %f409, %f408; setp.ge.f32 %p850, %f2610, 0f00000000; and.pred %p851, %p850, %p849; sub.f32 %f2611, %f412, %f413; setp.ge.f32 %p852, %f2611, 0f00000000; and.pred %p853, %p852, %p851; selp.b16 %rs1022, 2, 3, %p853; $L__BB1_570: mul.f32 %f2612, %f397, %f397; fma.rn.f32 %f2613, %f395, %f395, %f2612; add.f32 %f417, %f2613, 0f00000000; mul.f32 %f2614, %f401, %f401; fma.rn.f32 %f2615, %f399, %f399, %f2614; add.f32 %f418, %f2615, 0f00000000; mul.f32 %f2616, %f415, %f415; fma.rn.f32 %f2617, %f414, %f414, %f2616; add.f32 %f419, %f2617, 0f00000000; setp.eq.s16 %p854, %rs1022, 1; @%p854 bra $L__BB1_585; setp.eq.s16 %p855, %rs1022, 2; @%p855 bra $L__BB1_581; setp.ne.s16 %p856, %rs1022, 3; @%p856 bra $L__BB1_589; sub.f32 %f2618, %f404, %f408; div.rn.f32 %f420, %f404, %f2618; sub.f32 %f2619, %f405, %f413; div.rn.f32 %f421, %f405, %f2619; sub.f32 %f2620, %f409, %f408; add.f32 %f2621, %f412, %f2620; sub.f32 
%f2622, %f2621, %f413; div.rn.f32 %f5361, %f2620, %f2622; mul.f32 %f2623, %f403, %f403; fma.rn.f32 %f2624, %f402, %f402, %f2623; add.f32 %f2625, %f2624, 0f00000000; mul.f32 %f2626, %f417, %f420; mul.f32 %f2627, %f420, %f2626; sub.f32 %f423, %f2625, %f2627; mul.f32 %f2628, %f418, %f5361; mul.f32 %f2629, %f5361, %f2628; sub.f32 %f424, %f2625, %f2629; mul.f32 %f2630, %f407, %f407; fma.rn.f32 %f2631, %f406, %f406, %f2630; add.f32 %f2632, %f2631, 0f00000000; mul.f32 %f2633, %f419, %f421; mul.f32 %f2634, %f421, %f2633; sub.f32 %f425, %f2632, %f2634; setp.lt.f32 %p857, %f423, %f424; @%p857 bra $L__BB1_577; bra.uni $L__BB1_574; $L__BB1_577: setp.lt.f32 %p859, %f423, %f425; @%p859 bra $L__BB1_579; bra.uni $L__BB1_578; $L__BB1_579: mul.f32 %f5360, %f397, %f420; fma.rn.f32 %f5358, %f395, %f420, %f392; mov.u32 %r4945, 0; mov.f32 %f5359, %f393; mov.f32 %f5361, %f420; bra.uni $L__BB1_580; $L__BB1_581: add.u64 %rd11813, %SP, 552; cvta.to.local.u64 %rd11811, %rd11813; add.u64 %rd11819, %SP, 0; cvta.to.local.u64 %rd11817, %rd11819; mul.f32 %f2637, %f415, %f407; fma.rn.f32 %f2638, %f414, %f406, %f2637; div.rn.f32 %f5362, %f2638, %f419; fma.rn.f32 %f2639, %f414, %f5362, %f394; mov.b32 %r2494, %f2639; fma.rn.f32 %f2640, %f415, %f5362, %f5359; mov.b32 %r2495, %f2640; cvt.u64.u32 %rd6798, %r2495; cvt.u64.u32 %rd6799, %r2494; bfi.b64 %rd1353, %rd6798, %rd6799, 32, 32; st.local.u64 [%rd11817], %rd1353; mov.u64 %rd11824, 2; mov.u64 %rd11810, %rd1164; mov.u64 %rd11812, %rd11811; mov.u64 %rd11814, %rd11811; mov.u64 %rd11815, %rd11811; mov.u64 %rd11816, %rd11813; mov.u64 %rd11818, %rd11817; mov.u64 %rd11820, %rd11817; mov.u64 %rd11821, %rd11817; mov.u64 %rd11822, %rd11819; mov.u64 %rd11823, %rd1163; $L__BB1_582: setp.eq.s64 %p860, %rd11824, 0; mov.u64 %rd11859, 1; @%p860 bra $L__BB1_584; add.s64 %rd11824, %rd11824, -1; add.s64 %rd6804, %rd11811, 8; setp.eq.s64 %p861, %rd11814, %rd11810; selp.b64 %rd6805, %rd6804, %rd11814, %p861; add.s64 %rd6806, %rd11812, 8; selp.b64 %rd6807, %rd6806, 
%rd11815, %p861; add.s64 %rd6808, %rd11813, 8; selp.b64 %rd6809, %rd6808, %rd11816, %p861; mov.u64 %rd11859, 0; setp.eq.s64 %p862, %rd11824, 0; add.s64 %rd6810, %rd6805, 4; add.s64 %rd6811, %rd6807, 4; add.s64 %rd6812, %rd6809, 4; selp.b64 %rd1370, %rd6805, %rd6810, %p862; selp.b64 %rd11815, %rd6807, %rd6811, %p862; selp.b64 %rd11816, %rd6809, %rd6812, %p862; selp.b64 %rd11811, %rd6804, %rd11811, %p861; selp.b64 %rd11812, %rd6806, %rd11812, %p861; selp.b64 %rd11813, %rd6808, %rd11813, %p861; add.s64 %rd6813, %rd11814, 8; selp.b64 %rd11810, %rd6813, %rd11810, %p861; add.s64 %rd6814, %rd11820, 8; setp.eq.s64 %p863, %rd11817, %rd11823; selp.b64 %rd6815, %rd6814, %rd11817, %p863; add.s64 %rd6816, %rd11821, 8; selp.b64 %rd6817, %rd6816, %rd11818, %p863; add.s64 %rd6818, %rd11822, 8; selp.b64 %rd6819, %rd6818, %rd11819, %p863; selp.b64 %rd11820, %rd6814, %rd11820, %p863; selp.b64 %rd11821, %rd6816, %rd11821, %p863; selp.b64 %rd11822, %rd6818, %rd11822, %p863; add.s64 %rd6820, %rd11817, 8; selp.b64 %rd11823, %rd6820, %rd11823, %p863; add.s64 %rd6821, %rd6815, 4; add.s64 %rd6822, %rd6817, 4; add.s64 %rd6823, %rd6819, 4; selp.b64 %rd11817, %rd6815, %rd6821, %p862; selp.b64 %rd11818, %rd6817, %rd6822, %p862; selp.b64 %rd11819, %rd6819, %rd6823, %p862; ld.local.f32 %f2641, [%rd6817]; ld.local.f32 %f2642, [%rd6807]; setp.eq.f32 %p864, %f2642, %f2641; mov.u64 %rd11814, %rd1370; @%p864 bra $L__BB1_582; $L__BB1_584: mov.u64 %rd11076, 0; or.b64 %rd11858, %rd11076, %rd1353; mov.u32 %r4945, 1; bra.uni $L__BB1_593; $L__BB1_585: add.u64 %rd11829, %SP, 552; cvta.to.local.u64 %rd11827, %rd11829; add.u64 %rd11835, %SP, 0; cvta.to.local.u64 %rd11833, %rd11835; div.rn.f32 %f5362, %f405, %f418; fma.rn.f32 %f2643, %f399, %f5362, %f392; mov.b32 %r2497, %f2643; fma.rn.f32 %f2644, %f401, %f5362, %f393; mov.b32 %r2498, %f2644; cvt.u64.u32 %rd6825, %r2498; cvt.u64.u32 %rd6826, %r2497; bfi.b64 %rd1394, %rd6825, %rd6826, 32, 32; st.local.u64 [%rd11833], %rd1394; mov.u64 %rd11840, 2; mov.u64 
%rd11826, %rd1164; mov.u64 %rd11828, %rd11827; mov.u64 %rd11830, %rd11827; mov.u64 %rd11831, %rd11827; mov.u64 %rd11832, %rd11829; mov.u64 %rd11834, %rd11833; mov.u64 %rd11836, %rd11833; mov.u64 %rd11837, %rd11833; mov.u64 %rd11838, %rd11835; mov.u64 %rd11839, %rd1162; $L__BB1_586: setp.eq.s64 %p865, %rd11840, 0; mov.u64 %rd11859, 1; @%p865 bra $L__BB1_588; add.s64 %rd11840, %rd11840, -1; add.s64 %rd6831, %rd11827, 8; setp.eq.s64 %p866, %rd11830, %rd11826; selp.b64 %rd6832, %rd6831, %rd11830, %p866; add.s64 %rd6833, %rd11828, 8; selp.b64 %rd6834, %rd6833, %rd11831, %p866; add.s64 %rd6835, %rd11829, 8; selp.b64 %rd6836, %rd6835, %rd11832, %p866; mov.u64 %rd11859, 0; setp.eq.s64 %p867, %rd11840, 0; add.s64 %rd6837, %rd6832, 4; add.s64 %rd6838, %rd6834, 4; add.s64 %rd6839, %rd6836, 4; selp.b64 %rd1411, %rd6832, %rd6837, %p867; selp.b64 %rd11831, %rd6834, %rd6838, %p867; selp.b64 %rd11832, %rd6836, %rd6839, %p867; selp.b64 %rd11827, %rd6831, %rd11827, %p866; selp.b64 %rd11828, %rd6833, %rd11828, %p866; selp.b64 %rd11829, %rd6835, %rd11829, %p866; add.s64 %rd6840, %rd11830, 8; selp.b64 %rd11826, %rd6840, %rd11826, %p866; add.s64 %rd6841, %rd11836, 8; setp.eq.s64 %p868, %rd11833, %rd11839; selp.b64 %rd6842, %rd6841, %rd11833, %p868; add.s64 %rd6843, %rd11837, 8; selp.b64 %rd6844, %rd6843, %rd11834, %p868; add.s64 %rd6845, %rd11838, 8; selp.b64 %rd6846, %rd6845, %rd11835, %p868; selp.b64 %rd11836, %rd6841, %rd11836, %p868; selp.b64 %rd11837, %rd6843, %rd11837, %p868; selp.b64 %rd11838, %rd6845, %rd11838, %p868; add.s64 %rd6847, %rd11833, 8; selp.b64 %rd11839, %rd6847, %rd11839, %p868; add.s64 %rd6848, %rd6842, 4; add.s64 %rd6849, %rd6844, 4; add.s64 %rd6850, %rd6846, 4; selp.b64 %rd11833, %rd6842, %rd6848, %p867; selp.b64 %rd11834, %rd6844, %rd6849, %p867; selp.b64 %rd11835, %rd6846, %rd6850, %p867; ld.local.f32 %f2645, [%rd6844]; ld.local.f32 %f2646, [%rd6834]; setp.eq.f32 %p869, %f2646, %f2645; mov.u64 %rd11830, %rd1411; @%p869 bra $L__BB1_586; $L__BB1_588: mov.u64 
%rd11077, 0; or.b64 %rd11858, %rd11077, %rd1394; mov.u32 %r4945, 2; bra.uni $L__BB1_593; $L__BB1_589: div.rn.f32 %f5362, %f404, %f417; fma.rn.f32 %f2647, %f395, %f5362, %f392; mov.b32 %r2500, %f2647; fma.rn.f32 %f2648, %f397, %f5362, %f393; mov.b32 %r2501, %f2648; cvt.u64.u32 %rd6852, %r2501; cvt.u64.u32 %rd6853, %r2500; bfi.b64 %rd1435, %rd6852, %rd6853, 32, 32; st.local.u64 [%rd6549], %rd1435; mov.u64 %rd11856, 2; mov.u64 %rd11842, %rd1164; mov.u64 %rd11843, %rd6561; mov.u64 %rd11844, %rd6561; mov.u64 %rd11845, %rd6560; mov.u64 %rd11846, %rd6561; mov.u64 %rd11847, %rd6561; mov.u64 %rd11848, %rd6560; mov.u64 %rd11849, %rd6549; mov.u64 %rd11850, %rd6549; mov.u64 %rd11851, %rd6548; mov.u64 %rd11852, %rd6549; mov.u64 %rd11853, %rd6549; mov.u64 %rd11854, %rd6548; mov.u64 %rd11855, %rd1161; $L__BB1_590: setp.eq.s64 %p870, %rd11856, 0; mov.u64 %rd11859, 1; @%p870 bra $L__BB1_592; add.s64 %rd11856, %rd11856, -1; add.s64 %rd6858, %rd11843, 8; setp.eq.s64 %p871, %rd11846, %rd11842; selp.b64 %rd6859, %rd6858, %rd11846, %p871; add.s64 %rd6860, %rd11844, 8; selp.b64 %rd6861, %rd6860, %rd11847, %p871; add.s64 %rd6862, %rd11845, 8; selp.b64 %rd6863, %rd6862, %rd11848, %p871; mov.u64 %rd11859, 0; setp.eq.s64 %p872, %rd11856, 0; add.s64 %rd6864, %rd6859, 4; add.s64 %rd6865, %rd6861, 4; add.s64 %rd6866, %rd6863, 4; selp.b64 %rd1452, %rd6859, %rd6864, %p872; selp.b64 %rd11847, %rd6861, %rd6865, %p872; selp.b64 %rd11848, %rd6863, %rd6866, %p872; selp.b64 %rd11843, %rd6858, %rd11843, %p871; selp.b64 %rd11844, %rd6860, %rd11844, %p871; selp.b64 %rd11845, %rd6862, %rd11845, %p871; add.s64 %rd6867, %rd11846, 8; selp.b64 %rd11842, %rd6867, %rd11842, %p871; add.s64 %rd6868, %rd11852, 8; setp.eq.s64 %p873, %rd11849, %rd11855; selp.b64 %rd6869, %rd6868, %rd11849, %p873; add.s64 %rd6870, %rd11853, 8; selp.b64 %rd6871, %rd6870, %rd11850, %p873; add.s64 %rd6872, %rd11854, 8; selp.b64 %rd6873, %rd6872, %rd11851, %p873; selp.b64 %rd11852, %rd6868, %rd11852, %p873; selp.b64 %rd11853, %rd6870, 
%rd11853, %p873; selp.b64 %rd11854, %rd6872, %rd11854, %p873; add.s64 %rd6874, %rd11849, 8; selp.b64 %rd11855, %rd6874, %rd11855, %p873; add.s64 %rd6875, %rd6869, 4; add.s64 %rd6876, %rd6871, 4; add.s64 %rd6877, %rd6873, 4; selp.b64 %rd11849, %rd6869, %rd6875, %p872; selp.b64 %rd11850, %rd6871, %rd6876, %p872; selp.b64 %rd11851, %rd6873, %rd6877, %p872; ld.local.f32 %f2649, [%rd6871]; ld.local.f32 %f2650, [%rd6861]; setp.eq.f32 %p874, %f2650, %f2649; mov.u64 %rd11846, %rd1452; @%p874 bra $L__BB1_590; $L__BB1_592: mov.u64 %rd11078, 0; or.b64 %rd11858, %rd11078, %rd1435; mov.u32 %r4945, 0; $L__BB1_593: mov.f32 %f2651, 0f3F800000; sub.f32 %f2652, %f2651, %f5362; mov.b32 %r2504, %f2652; mov.b32 %r2505, %f5362; cvt.u64.u32 %rd6878, %r2505; cvt.u64.u32 %rd6879, %r2504; bfi.b64 %rd11908, %rd6878, %rd6879, 32, 32; mov.b64 {%r2506, %r2507}, %rd11859; mov.b64 {%r2508, %r2509}, %rd11858; cvt.u32.u64 %r4943, %rd11858; mov.b32 %f5363, %r2509; mov.u32 %r4944, 1; mov.b32 {%rs1023, %rs408}, %r2506; bra.uni $L__BB1_606; $L__BB1_574: setp.lt.f32 %p858, %f424, %f425; @%p858 bra $L__BB1_576; bra.uni $L__BB1_575; $L__BB1_576: mul.f32 %f5360, %f401, %f421; fma.rn.f32 %f5358, %f399, %f421, %f392; mov.u32 %r4945, 2; mov.f32 %f5359, %f393; mov.f32 %f5361, %f421; bra.uni $L__BB1_580; $L__BB1_578: mul.f32 %f5360, %f415, %f5361; fma.rn.f32 %f5358, %f414, %f5361, %f394; mov.u32 %r4945, 1; bra.uni $L__BB1_580; $L__BB1_575: mul.f32 %f5360, %f415, %f5361; fma.rn.f32 %f5358, %f414, %f5361, %f394; mov.u32 %r4945, 1; $L__BB1_580: add.f32 %f5363, %f5359, %f5360; mov.f32 %f2635, 0f3F800000; sub.f32 %f2636, %f2635, %f5361; mov.b32 %r2492, %f2636; mov.b32 %r2493, %f5361; cvt.u64.u32 %rd6795, %r2493; cvt.u64.u32 %rd6796, %r2492; bfi.b64 %rd11908, %rd6795, %rd6796, 32, 32; mov.b32 %r4943, %f5358; mov.u32 %r4944, 1; mov.u16 %rs1023, 1; $L__BB1_606: mov.b32 %f2659, %r4943; sub.f32 %f2660, %f2659, %f389; sub.f32 %f2661, %f5363, %f390; mul.f32 %f2662, %f2661, %f2661; fma.rn.f32 %f2663, %f2660, %f2660, %f2662; 
add.f32 %f2664, %f2663, 0f00000000; sqrt.rn.f32 %f2665, %f2664; shl.b64 %rd6963, %rd1334, 2; add.s64 %rd6964, %rd24, %rd6963; st.local.f32 [%rd6964+-4], %f2665; mul.lo.s64 %rd6965, %rd1334, 36; add.s64 %rd6966, %rd7, %rd6965; st.local.u32 [%rd6966+-36], %r4943; st.local.f32 [%rd6966+-32], %f5363; mov.u16 %rs421, 0; st.local.v4.u8 [%rd6966+-28], {%rs1023, %rs421, %rs421, %rs421}; st.local.u32 [%rd6966+-24], %r406; st.local.u32 [%rd6966+-20], %r4944; st.local.u32 [%rd6966+-16], %r4945; shr.u64 %rd6967, %rd11908, 32; st.local.u32 [%rd6966+-8], %rd6967; st.local.u32 [%rd6966+-12], %rd11908; $L__BB1_607: setp.lt.u64 %p890, %rd1334, 4; add.s64 %rd1334, %rd1334, 1; @%p890 bra $L__BB1_554; ld.local.v2.u64 {%rd11909, %rd11910}, [%rd24]; ld.local.v4.u32 {%r4955, %r4956, %r4957, %r2537}, [%rd7]; ld.local.u32 %r4958, [%rd7+16]; ld.local.u32 %rd6970, [%rd1175+4]; ld.local.u32 %rd6971, [%rd1175+8]; bfi.b64 %rd6972, %rd6971, %rd6970, 32, 32; mov.b64 {%r4952, %r4953}, %rd6972; ld.local.u32 %r4954, [%rd1175+12]; ld.local.u32 %r4959, [%rd1176+4]; ld.local.u32 %r4951, [%rd1177+16]; ld.local.u64 %rd6973, [%rd1177+8]; mov.b64 {%r4949, %r4950}, %rd6973; ld.local.u32 %r4960, [%rd1178+8]; ld.local.u32 %rd6974, [%rd1179+12]; ld.local.u32 %rd6975, [%rd1179+16]; bfi.b64 %rd6976, %rd6975, %rd6974, 32, 32; mov.b64 {%r4946, %r4947}, %rd6976; ld.local.u32 %r4948, [%rd1179+20]; ld.local.u32 %r4961, [%rd1180+12]; bra.uni $L__BB1_609; $L__BB1_552: mov.u32 %r4958, 4; mov.u32 %r4959, %r4958; mov.u32 %r4960, %r4958; mov.u32 %r4961, %r4958; $L__BB1_609: and.b64 %rd6977, %rd1330, 1; setp.eq.b64 %p891, %rd6977, 1; mov.pred %p892, 0; xor.pred %p893, %p891, %p892; not.pred %p894, %p893; mov.b64 {%r449, %r450}, %rd11909; mov.b32 %f448, %r449; mov.b32 %f449, %r450; mov.b64 {%r451, %r452}, %rd11910; mov.b32 %f450, %r451; mov.b32 %f451, %r452; @%p894 bra $L__BB1_618; bra.uni $L__BB1_610; $L__BB1_618: and.b64 %rd6995, %rd1330, 2; setp.eq.s64 %p908, %rd6995, 0; @%p908 bra $L__BB1_627; bra.uni $L__BB1_619; 
$L__BB1_627: and.b64 %rd7013, %rd1330, 4; setp.eq.s64 %p922, %rd7013, 0; @%p922 bra $L__BB1_636; bra.uni $L__BB1_628; $L__BB1_636: and.b64 %rd7031, %rd1330, 8; setp.eq.s64 %p936, %rd7031, 0; @%p936 bra $L__BB1_544; ld.u8 %rs428, [%rd1319+88]; and.b16 %rs429, %rs428, 1; setp.eq.b16 %p937, %rs429, 1; mov.pred %p938, 0; xor.pred %p939, %p937, %p938; not.pred %p940, %p939; @%p940 bra $L__BB1_640; bra.uni $L__BB1_638; $L__BB1_640: ld.u32 %r500, [%rd1319+76]; cvt.u64.u32 %rd7035, %r500; setp.le.u64 %p947, %rd1307, %rd7035; @%p947 bra $L__BB1_544; neg.f32 %f455, %f451; setp.lt.u32 %p948, %r405, 64; @%p948 bra $L__BB1_643; bra.uni $L__BB1_642; $L__BB1_643: mul.wide.u32 %rd7047, %r405, 8; add.s64 %rd7048, %rd1308, %rd7047; mov.u64 %rd11917, 0; st.local.u32 [%rd7048], %r500; st.local.f32 [%rd7048+4], %f455; add.s32 %r405, %r405, 1; st.local.u32 [%rd1308+512], %r405; mov.u64 %rd11918, %rd11917; bra.uni $L__BB1_644; $L__BB1_610: ld.u8 %rs422, [%rd1319+88]; and.b16 %rs423, %rs422, 1; setp.eq.b16 %p895, %rs423, 1; xor.pred %p897, %p895, %p892; not.pred %p898, %p897; @%p898 bra $L__BB1_613; bra.uni $L__BB1_611; $L__BB1_613: ld.u32 %r458, [%rd1319+64]; cvt.u64.u32 %rd6981, %r458; setp.le.u64 %p905, %rd1307, %rd6981; @%p905 bra $L__BB1_618; neg.f32 %f452, %f448; setp.lt.u32 %p906, %r405, 64; @%p906 bra $L__BB1_616; bra.uni $L__BB1_615; $L__BB1_616: add.s32 %r2540, %r404, -1; mul.wide.u32 %rd6993, %r2540, 8; add.s64 %rd6994, %rd1308, %rd6993; mov.u64 %rd11911, 0; st.local.u32 [%rd6994], %r458; st.local.f32 [%rd6994+4], %f452; add.s32 %r405, %r405, 1; st.local.u32 [%rd1308+512], %r405; mov.u64 %rd11912, %rd11911; bra.uni $L__BB1_617; $L__BB1_619: ld.u8 %rs424, [%rd1319+88]; and.b16 %rs425, %rs424, 1; setp.eq.b16 %p909, %rs425, 1; mov.pred %p910, 0; xor.pred %p911, %p909, %p910; not.pred %p912, %p911; @%p912 bra $L__BB1_622; bra.uni $L__BB1_620; $L__BB1_622: ld.u32 %r472, [%rd1319+68]; cvt.u64.u32 %rd6999, %r472; setp.le.u64 %p919, %rd1307, %rd6999; @%p919 bra $L__BB1_627; neg.f32 
%f453, %f449; setp.lt.u32 %p920, %r405, 64; @%p920 bra $L__BB1_625; bra.uni $L__BB1_624; $L__BB1_625: mul.wide.u32 %rd7011, %r405, 8; add.s64 %rd7012, %rd1308, %rd7011; mov.u64 %rd11913, 0; st.local.u32 [%rd7012], %r472; st.local.f32 [%rd7012+4], %f453; add.s32 %r405, %r405, 1; st.local.u32 [%rd1308+512], %r405; mov.u64 %rd11914, %rd11913; bra.uni $L__BB1_626; $L__BB1_628: ld.u8 %rs426, [%rd1319+88]; and.b16 %rs427, %rs426, 1; setp.eq.b16 %p923, %rs427, 1; mov.pred %p924, 0; xor.pred %p925, %p923, %p924; not.pred %p926, %p925; @%p926 bra $L__BB1_631; bra.uni $L__BB1_629; $L__BB1_631: ld.u32 %r486, [%rd1319+72]; cvt.u64.u32 %rd7017, %r486; setp.le.u64 %p933, %rd1307, %rd7017; @%p933 bra $L__BB1_636; neg.f32 %f454, %f450; setp.lt.u32 %p934, %r405, 64; @%p934 bra $L__BB1_634; bra.uni $L__BB1_633; $L__BB1_634: mul.wide.u32 %rd7029, %r405, 8; add.s64 %rd7030, %rd1308, %rd7029; mov.u64 %rd11915, 0; st.local.u32 [%rd7030], %r486; st.local.f32 [%rd7030+4], %f454; add.s32 %r405, %r405, 1; st.local.u32 [%rd1308+512], %r405; mov.u64 %rd11916, %rd11915; bra.uni $L__BB1_635; $L__BB1_611: setp.leu.f32 %p899, %f391, %f448; setp.eq.s32 %p900, %r4958, 4; or.pred %p901, %p900, %p899; @%p901 bra $L__BB1_618; ld.u32 %r2538, [%rd1319+64]; cvt.u64.u32 %rd6978, %r2538; setp.le.u64 %p902, %rd1310, %rd6978; mul.wide.u32 %rd6979, %r2538, 12; add.s64 %rd6980, %rd1311, %rd6979; setp.eq.s64 %p903, %rd6980, 0; or.pred %p904, %p902, %p903; selp.b32 %r400, %r400, %r4957, %p904; selp.b32 %r399, %r399, %r4956, %p904; selp.b32 %r398, %r398, %r4955, %p904; selp.b32 %r402, %r402, %r4958, %p904; selp.b32 %r403, %r403, %r449, %p904; bra.uni $L__BB1_618; $L__BB1_620: mov.b32 %f2666, %r403; setp.leu.f32 %p913, %f2666, %f449; setp.eq.s32 %p914, %r4959, 4; or.pred %p915, %p914, %p913; @%p915 bra $L__BB1_627; ld.u32 %r2546, [%rd1319+68]; cvt.u64.u32 %rd6996, %r2546; setp.le.u64 %p916, %rd1310, %rd6996; mul.wide.u32 %rd6997, %r2546, 12; add.s64 %rd6998, %rd1311, %rd6997; setp.eq.s64 %p917, %rd6998, 0; or.pred 
%p918, %p916, %p917; selp.b32 %r400, %r400, %r4954, %p918; selp.b32 %r399, %r399, %r4953, %p918; selp.b32 %r398, %r398, %r4952, %p918; selp.b32 %r402, %r402, %r4959, %p918; selp.b32 %r403, %r403, %r450, %p918; bra.uni $L__BB1_627; $L__BB1_629: mov.b32 %f2667, %r403; setp.leu.f32 %p927, %f2667, %f450; setp.eq.s32 %p928, %r4960, 4; or.pred %p929, %p928, %p927; @%p929 bra $L__BB1_636; ld.u32 %r2553, [%rd1319+72]; cvt.u64.u32 %rd7014, %r2553; setp.le.u64 %p930, %rd1310, %rd7014; mul.wide.u32 %rd7015, %r2553, 12; add.s64 %rd7016, %rd1311, %rd7015; setp.eq.s64 %p931, %rd7016, 0; or.pred %p932, %p930, %p931; selp.b32 %r400, %r400, %r4951, %p932; selp.b32 %r399, %r399, %r4950, %p932; selp.b32 %r398, %r398, %r4949, %p932; selp.b32 %r402, %r402, %r4960, %p932; selp.b32 %r403, %r403, %r451, %p932; bra.uni $L__BB1_636; $L__BB1_638: mov.b32 %f2668, %r403; setp.leu.f32 %p941, %f2668, %f451; setp.eq.s32 %p942, %r4961, 4; or.pred %p943, %p942, %p941; @%p943 bra $L__BB1_544; bra.uni $L__BB1_639; $L__BB1_615: mov.u64 %rd11912, 1; shl.b64 %rd11911, %rd6981, 32; $L__BB1_617: mov.u64 %rd11082, 0; cvt.u32.u64 %r2541, %rd11082; cvt.u32.u64 %r2542, %rd11911; or.b32 %r2543, %r2542, %r2541; cvt.u32.u64 %r2544, %rd11912; or.b32 %r2545, %r2543, %r2544; setp.ne.s32 %p907, %r2545, 0; @%p907 bra $L__BB1_645; bra.uni $L__BB1_618; $L__BB1_624: mov.u64 %rd11914, 1; shl.b64 %rd11913, %rd6999, 32; $L__BB1_626: mov.u64 %rd11085, 0; cvt.u32.u64 %r2548, %rd11085; cvt.u32.u64 %r2549, %rd11913; or.b32 %r2550, %r2549, %r2548; cvt.u32.u64 %r2551, %rd11914; or.b32 %r2552, %r2550, %r2551; setp.ne.s32 %p921, %r2552, 0; @%p921 bra $L__BB1_645; bra.uni $L__BB1_627; $L__BB1_633: mov.u64 %rd11916, 1; shl.b64 %rd11915, %rd7017, 32; $L__BB1_635: mov.u64 %rd11088, 0; cvt.u32.u64 %r2555, %rd11088; cvt.u32.u64 %r2556, %rd11915; or.b32 %r2557, %r2556, %r2555; cvt.u32.u64 %r2558, %rd11916; or.b32 %r2559, %r2557, %r2558; setp.ne.s32 %p935, %r2559, 0; @%p935 bra $L__BB1_645; bra.uni $L__BB1_636; $L__BB1_642: mov.u64 
%rd11918, 1; shl.b64 %rd11917, %rd7035, 32; $L__BB1_644: mov.u64 %rd11091, 0; cvt.u32.u64 %r2562, %rd11091; cvt.u32.u64 %r2563, %rd11917; or.b32 %r2564, %r2563, %r2562; cvt.u32.u64 %r2565, %rd11918; or.b32 %r2566, %r2564, %r2565; setp.eq.s32 %p949, %r2566, 0; @%p949 bra $L__BB1_544; bra.uni $L__BB1_645; $L__BB1_646: setp.eq.s32 %p950, %r402, 4; mov.u64 %rd11919, %rd6703; mov.u64 %rd11920, %rd6701; mov.u64 %rd11921, %rd6703; @%p950 bra $L__BB1_648; mov.b64 %rd11921, {%r398, %r399}; mov.b32 {%rs430, %rs431}, %r400; mov.b64 %rd7055, {%r400, %r2567}; and.b64 %rd11919, %rd7055, 4294967040; cvt.u64.u16 %rd7056, %rs430; and.b64 %rd11920, %rd7056, 255; $L__BB1_648: or.b64 %rd7063, %rd11920, %rd11919; or.b64 %rd7064, %rd7063, %rd6703; mov.b64 {%r2568, %r2569}, %rd7064; mov.b32 {%rs64, %rs432}, %r2568; and.b16 %rs433, %rs64, 255; setp.eq.s16 %p951, %rs433, 2; @%p951 bra $L__BB1_650; cvt.u32.u64 %r2570, %rd11921; mov.b32 %f2669, %r2570; shr.u64 %rd7065, %rd11921, 32; cvt.u32.u64 %r2571, %rd7065; mov.b32 %f2670, %r2571; ld.global.f32 %f2671, [%rd1182+248]; mul.f32 %f2672, %f2671, %f2669; ld.global.f32 %f2673, [%rd1182+252]; mul.f32 %f2674, %f2673, %f2670; sub.f32 %f2675, %f2672, %f2674; mul.f32 %f2676, %f2673, %f2669; fma.rn.f32 %f2677, %f2671, %f2670, %f2676; ld.global.f32 %f2678, [%rd1182+256]; add.f32 %f2679, %f2678, %f2675; mov.b32 %r2572, %f2679; ld.global.f32 %f2680, [%rd1182+260]; add.f32 %f2681, %f2680, %f2677; mov.b32 %r2573, %f2681; cvt.u64.u32 %rd7066, %r2573; cvt.u64.u32 %rd7067, %r2572; cvt.u64.u16 %rd7068, %rs64; bfi.b64 %rd6703, %rd7066, %rd7067, 32, 32; and.b64 %rd7069, %rd7068, 255; mov.b64 {%r2574, %r2575}, %rd7069; mov.b32 {%rs434, %rs435}, %r2574; cvt.u64.u16 %rd6701, %rs434; $L__BB1_650: mov.u64 %rd11100, 0; or.b64 %rd7076, %rd11100, %rd6701; or.b64 %rd1658, %rd7076, %rd11100; mov.b64 {%r2576, %r2577}, %rd1658; mov.b32 {%rs65, %rs436}, %r2576; and.b16 %rs437, %rs65, 255; setp.eq.s16 %p952, %rs437, 2; mov.u64 %rd11924, 2; mov.u64 %rd11925, %rd11100; mov.u64 
%rd11926, %rd11100; @%p952 bra $L__BB1_652; and.b64 %rd7078, %rd1658, 4294967040; cvt.u64.u16 %rd7079, %rs65; and.b64 %rd7080, %rd7079, 255; or.b64 %rd7081, %rd7080, %rd11100; or.b64 %rd7082, %rd7081, %rd7078; mov.b64 {%r2578, %r2579}, %rd7082; mov.b32 {%rs438, %rs439}, %r2578; not.b16 %rs440, %rs438; ld.global.u8 %rs441, [%rd1182+240]; setp.eq.s16 %p953, %rs441, 0; and.b16 %rs442, %rs440, 1; selp.b16 %rs443, %rs438, %rs442, %p953; and.b64 %rd7083, %rd7082, 4294967040; cvt.u64.u16 %rd7084, %rs443; and.b64 %rd7085, %rd7084, 255; or.b64 %rd7086, %rd7083, %rd11100; or.b64 %rd7087, %rd7086, %rd7085; mov.b64 {%r2580, %r2581}, %rd7087; mov.b32 {%rs444, %rs445}, %r2580; and.b64 %rd11926, %rd7087, 4294967040; cvt.u64.u16 %rd7088, %rs444; and.b64 %rd11924, %rd7088, 255; mov.u64 %rd11925, %rd6703; $L__BB1_652: or.b64 %rd7089, %rd11925, %rd11100; or.b64 %rd7090, %rd11100, %rd11924; or.b64 %rd7091, %rd7090, %rd11926; or.b64 %rd7092, %rd7089, %rd11100; mov.b64 {%r4992, %r4993}, %rd7092; mov.b64 {%r4994, %r2582}, %rd7091; bra.uni $L__BB1_709; $L__BB1_512: cvt.u32.u64 %r2375, %rd1187; cvt.u32.u64 %r2376, %rd1206; rem.u32 %r2377, %r2376, %r2375; cvt.u64.u32 %rd11766, %r2377; $L__BB1_513: shl.b64 %rd6606, %rd11766, 3; add.s64 %rd1210, %rd1188, %rd6606; ld.u32 %rd6607, [%rd1210]; ld.u32 %rd6608, [%rd1210+4]; bfi.b64 %rd1211, %rd6608, %rd6607, 32, 32; add.s64 %rd1212, %rd11766, 1; or.b64 %rd6609, %rd1212, %rd1187; and.b64 %rd6610, %rd6609, -4294967296; setp.eq.s64 %p770, %rd6610, 0; @%p770 bra $L__BB1_515; rem.u64 %rd11767, %rd1212, %rd1187; bra.uni $L__BB1_516; $L__BB1_515: cvt.u32.u64 %r2378, %rd1187; cvt.u32.u64 %r2379, %rd1212; rem.u32 %r2380, %r2379, %r2378; cvt.u64.u32 %rd11767, %r2380; $L__BB1_516: add.u64 %rd11777, %SP, 560; cvta.to.local.u64 %rd11775, %rd11777; shl.b64 %rd6612, %rd11767, 3; add.s64 %rd1222, %rd1188, %rd6612; ld.u32 %rd6613, [%rd1222]; ld.u32 %rd6614, [%rd1222+4]; bfi.b64 %rd6615, %rd6614, %rd6613, 32, 32; st.local.v2.u64 [%rd11775], {%rd1211, %rd6615}; 
mov.u64 %rd11782, 2; mov.u64 %rd11768, %rd1170; mov.u64 %rd11769, %rd1168; mov.u64 %rd11770, %rd1168; mov.u64 %rd11771, %rd1169; mov.u64 %rd11772, %rd1168; mov.u64 %rd11773, %rd1168; mov.u64 %rd11774, %rd1169; mov.u64 %rd11776, %rd11775; mov.u64 %rd11778, %rd11775; mov.u64 %rd11779, %rd11775; mov.u64 %rd11780, %rd11777; mov.u64 %rd11781, %rd1171; $L__BB1_517: setp.eq.s64 %p771, %rd11782, 0; @%p771 bra $L__BB1_520; add.s64 %rd11782, %rd11782, -1; add.s64 %rd6616, %rd11769, 8; setp.eq.s64 %p772, %rd11772, %rd11768; selp.b64 %rd6617, %rd6616, %rd11772, %p772; add.s64 %rd6618, %rd11770, 8; selp.b64 %rd6619, %rd6618, %rd11773, %p772; add.s64 %rd6620, %rd11771, 8; selp.b64 %rd6621, %rd6620, %rd11774, %p772; setp.eq.s64 %p773, %rd11782, 0; add.s64 %rd6622, %rd6617, 4; add.s64 %rd6623, %rd6619, 4; add.s64 %rd6624, %rd6621, 4; selp.b64 %rd1239, %rd6617, %rd6622, %p773; selp.b64 %rd11773, %rd6619, %rd6623, %p773; selp.b64 %rd11774, %rd6621, %rd6624, %p773; selp.b64 %rd11769, %rd6616, %rd11769, %p772; selp.b64 %rd11770, %rd6618, %rd11770, %p772; selp.b64 %rd11771, %rd6620, %rd11771, %p772; add.s64 %rd6625, %rd11772, 8; selp.b64 %rd11768, %rd6625, %rd11768, %p772; add.s64 %rd6626, %rd11778, 8; setp.eq.s64 %p774, %rd11775, %rd11781; selp.b64 %rd6627, %rd6626, %rd11775, %p774; add.s64 %rd6628, %rd11779, 8; selp.b64 %rd6629, %rd6628, %rd11776, %p774; add.s64 %rd6630, %rd11780, 8; selp.b64 %rd6631, %rd6630, %rd11777, %p774; selp.b64 %rd11778, %rd6626, %rd11778, %p774; selp.b64 %rd11779, %rd6628, %rd11779, %p774; selp.b64 %rd11780, %rd6630, %rd11780, %p774; add.s64 %rd6632, %rd11775, 8; selp.b64 %rd11781, %rd6632, %rd11781, %p774; add.s64 %rd6633, %rd6627, 4; add.s64 %rd6634, %rd6629, 4; add.s64 %rd6635, %rd6631, 4; selp.b64 %rd11775, %rd6627, %rd6633, %p773; selp.b64 %rd11776, %rd6629, %rd6634, %p773; selp.b64 %rd11777, %rd6631, %rd6635, %p773; ld.local.f32 %f2435, [%rd6629]; ld.local.f32 %f2436, [%rd6619]; setp.eq.f32 %p775, %f2436, %f2435; mov.u64 %rd11772, %rd1239; @%p775 bra 
$L__BB1_517; bra.uni $L__BB1_519; $L__BB1_520: ld.u32 %rd6636, [%rd1210]; ld.u32 %rd6637, [%rd1210+4]; bfi.b64 %rd6638, %rd6637, %rd6636, 32, 32; cvt.u32.u64 %r2381, %rd6638; mov.b32 %f2437, %r2381; shr.u64 %rd6639, %rd6638, 32; cvt.u32.u64 %r2382, %rd6639; mov.b32 %f2438, %r2382; ld.u32 %rd6640, [%rd1222]; ld.u32 %rd6641, [%rd1222+4]; bfi.b64 %rd6642, %rd6641, %rd6640, 32, 32; cvt.u32.u64 %r2383, %rd6642; shr.u64 %rd6643, %rd6642, 32; cvt.u32.u64 %r2384, %rd6643; mov.b32 %f2439, %r2383; sub.f32 %f5356, %f2439, %f2437; mov.b32 %f2440, %r2384; sub.f32 %f5357, %f2440, %f2438; bra.uni $L__BB1_531; $L__BB1_525: cvt.u32.u64 %r2385, %rd1187; cvt.u32.u64 %r2386, %rd1253; rem.u32 %r2387, %r2386, %r2385; cvt.u64.u32 %rd11783, %r2387; $L__BB1_526: shl.b64 %rd6652, %rd11783, 3; add.s64 %rd6653, %rd1188, %rd6652; ld.u32 %rd6654, [%rd6653]; ld.u32 %rd6655, [%rd6653+4]; bfi.b64 %rd1264, %rd6655, %rd6654, 32, 32; add.u64 %rd6657, %SPL, 560; st.local.v2.u64 [%rd6657], {%rd1254, %rd1264}; mov.u64 %rd11798, 2; mov.u64 %rd11784, %rd1168; mov.u64 %rd11785, %rd1165; mov.u64 %rd11786, %rd1165; mov.u64 %rd11787, %rd1167; mov.u64 %rd11788, %rd1165; mov.u64 %rd11789, %rd1165; mov.u64 %rd11790, %rd1167; mov.u64 %rd11791, %rd1172; mov.u64 %rd11792, %rd1172; mov.u64 %rd11793, %rd1173; mov.u64 %rd11794, %rd1172; mov.u64 %rd11795, %rd1172; mov.u64 %rd11796, %rd1173; mov.u64 %rd11797, %rd1174; $L__BB1_527: setp.eq.s64 %p779, %rd11798, 0; @%p779 bra $L__BB1_530; add.s64 %rd11798, %rd11798, -1; add.s64 %rd6658, %rd11785, 8; setp.eq.s64 %p780, %rd11788, %rd11784; selp.b64 %rd6659, %rd6658, %rd11788, %p780; add.s64 %rd6660, %rd11786, 8; selp.b64 %rd6661, %rd6660, %rd11789, %p780; add.s64 %rd6662, %rd11787, 8; selp.b64 %rd6663, %rd6662, %rd11790, %p780; setp.eq.s64 %p781, %rd11798, 0; add.s64 %rd6664, %rd6659, 4; add.s64 %rd6665, %rd6661, 4; add.s64 %rd6666, %rd6663, 4; selp.b64 %rd1281, %rd6659, %rd6664, %p781; selp.b64 %rd11789, %rd6661, %rd6665, %p781; selp.b64 %rd11790, %rd6663, %rd6666, %p781; 
selp.b64 %rd11785, %rd6658, %rd11785, %p780; selp.b64 %rd11786, %rd6660, %rd11786, %p780; selp.b64 %rd11787, %rd6662, %rd11787, %p780; add.s64 %rd6667, %rd11788, 8; selp.b64 %rd11784, %rd6667, %rd11784, %p780; add.s64 %rd6668, %rd11794, 8; setp.eq.s64 %p782, %rd11791, %rd11797; selp.b64 %rd6669, %rd6668, %rd11791, %p782; add.s64 %rd6670, %rd11795, 8; selp.b64 %rd6671, %rd6670, %rd11792, %p782; add.s64 %rd6672, %rd11796, 8; selp.b64 %rd6673, %rd6672, %rd11793, %p782; selp.b64 %rd11794, %rd6668, %rd11794, %p782; selp.b64 %rd11795, %rd6670, %rd11795, %p782; selp.b64 %rd11796, %rd6672, %rd11796, %p782; add.s64 %rd6674, %rd11791, 8; selp.b64 %rd11797, %rd6674, %rd11797, %p782; add.s64 %rd6675, %rd6669, 4; add.s64 %rd6676, %rd6671, 4; add.s64 %rd6677, %rd6673, 4; selp.b64 %rd11791, %rd6669, %rd6675, %p781; selp.b64 %rd11792, %rd6671, %rd6676, %p781; selp.b64 %rd11793, %rd6673, %rd6677, %p781; ld.local.f32 %f2441, [%rd6671]; ld.local.f32 %f2442, [%rd6661]; setp.eq.f32 %p783, %f2442, %f2441; mov.u64 %rd11788, %rd1281; @%p783 bra $L__BB1_527; bra.uni $L__BB1_529; $L__BB1_530: cvt.u32.u64 %r2388, %rd1254; mov.b32 %f2443, %r2388; shr.u64 %rd6678, %rd1254, 32; cvt.u32.u64 %r2389, %rd6678; mov.b32 %f2444, %r2389; shr.u64 %rd6679, %rd1264, 32; cvt.u32.u64 %r2390, %rd6679; cvt.u32.u64 %r2391, %rd1264; mov.b32 %f2445, %r2391; sub.f32 %f2446, %f2445, %f2443; mov.b32 %f2447, %r2390; sub.f32 %f2448, %f2447, %f2444; neg.f32 %f5356, %f2446; neg.f32 %f5357, %f2448; $L__BB1_531: mul.f32 %f2449, %f381, %f5357; fma.rn.f32 %f388, %f380, %f5356, %f2449; mul.f32 %f2450, %f5357, %f5357; fma.rn.f32 %f2451, %f5356, %f5356, %f2450; add.f32 %f2452, %f2451, 0f00000000; sqrt.rn.f32 %f2453, %f2452; mul.f32 %f2454, %f2453, 0f3A83126F; abs.f32 %f2455, %f388; setp.gt.f32 %p784, %f2455, %f2454; @%p784 bra $L__BB1_533; bra.uni $L__BB1_532; $L__BB1_533: setp.ge.f32 %p2920, %f388, 0f00000000; bra.uni $L__BB1_536; $L__BB1_532: ld.local.u64 %rd6680, [%rd1186+8]; cvt.u32.u64 %r2392, %rd6680; mov.b32 %f2456, 
%r2392; shr.u64 %rd6681, %rd6680, 32; cvt.u32.u64 %r2393, %rd6681; mov.b32 %f2457, %r2393; sub.f32 %f2458, %f179, %f2456; sub.f32 %f2459, %f180, %f2457; mul.f32 %f2460, %f381, %f2459; fma.rn.f32 %f2461, %f380, %f2458, %f2460; setp.le.f32 %p2920, %f2461, 0f00000000; $L__BB1_536: selp.u16 %rs381, 1, 0, %p2920; st.local.u8 [%rd1186+16], %rs381; $L__BB1_537: ld.local.v2.u32 {%r4930, %r4931}, [%rd1186+8]; ld.local.u32 %r4932, [%rd1186+16]; $L__BB1_539: setp.eq.s32 %p785, %r383, 2; mov.u64 %rd6689, 0; mov.u64 %rd11799, 2; mov.u64 %rd11800, %rd6689; @%p785 bra $L__BB1_541; setp.ne.s16 %p786, %rs49, 0; cvt.u16.u32 %rs383, %r4932; selp.u16 %rs384, 1, 0, %p786; xor.b16 %rs385, %rs383, %rs384; mov.b32 %f2468, %r4930; mov.b32 %f2469, %r4931; mul.f32 %f2470, %f351, %f2468; ld.global.f32 %f2471, [%rd1182+252]; mul.f32 %f2472, %f2471, %f2469; sub.f32 %f2473, %f2470, %f2472; mul.f32 %f2474, %f2471, %f2468; fma.rn.f32 %f2475, %f351, %f2469, %f2474; add.f32 %f2476, %f349, %f2473; mov.b32 %r2398, %f2476; add.f32 %f2477, %f350, %f2475; mov.b32 %r2399, %f2477; cvt.u64.u32 %rd6690, %r2399; cvt.u64.u32 %rd6691, %r2398; cvt.u64.u16 %rd6692, %rs385; bfi.b64 %rd11800, %rd6690, %rd6691, 32, 32; and.b64 %rd6693, %rd6692, 255; mov.b64 {%r2400, %r2401}, %rd6693; mov.b32 {%rs386, %rs387}, %r2400; cvt.u64.u16 %rd11799, %rs386; $L__BB1_541: or.b64 %rd6694, %rd6689, %rd6689; or.b64 %rd6695, %rd11799, %rd6689; or.b64 %rd6696, %rd6695, %rd6689; or.b64 %rd6697, %rd6694, %rd11800; mov.b64 {%r4992, %r4993}, %rd6697; mov.b64 {%r4994, %r2402}, %rd6696; $L__BB1_709: mov.b32 {%rs70, %rs455}, %r4994; and.b16 %rs456, %rs70, 255; setp.eq.s16 %p1031, %rs456, 2; @%p1031 bra $L__BB1_711; mov.b64 %rd7172, {%r4994, %r2650}; shr.u64 %rd7173, %rd7172, 8; and.b64 %rd7174, %rd7173, 16777215; cvt.u64.u16 %rd7175, %rs70; and.b64 %rd7176, %rd7175, 255; mov.b64 %rd7177, {%r4992, %r4993}; bfi.b64 %rd7178, %rd7174, %rd7176, 8, 56; mov.b64 {%r2651, %r2652}, %rd7178; mov.b32 {%rs457, %rs458}, %r2651; shr.u64 %rd7179, %rd7177, 
32; cvt.u32.u64 %r2653, %rd7179; mov.b32 %f2782, %r4992; sub.f32 %f2783, %f2782, %f179; mov.b32 %f2784, %r2653; sub.f32 %f2785, %f2784, %f180; mul.f32 %f2786, %f2785, %f2785; fma.rn.f32 %f2787, %f2783, %f2783, %f2786; add.f32 %f2788, %f2787, 0f00000000; sqrt.rn.f32 %f2789, %f2788; and.b16 %rs459, %rs457, 1; setp.eq.b16 %p1032, %rs459, 1; selp.f32 %f2790, 0fBF800000, 0f3F800000, %p1032; mul.f32 %f2791, %f2790, %f2789; setp.ge.f32 %p1033, %f2791, %f517; setp.le.f32 %p1034, %f2791, %f517; selp.b16 %rs460, 1, 2, %p1034; setp.gtu.f32 %p1035, %f2791, %f517; selp.b16 %rs461, -1, 0, %p1035; selp.b16 %rs462, %rs461, %rs460, %p1033; setp.eq.s16 %p1036, %rs462, 1; selp.f32 %f517, %f2791, %f517, %p1036; $L__BB1_711: add.s64 %rd1183, %rd1183, 280; setp.ne.s64 %p1037, %rd607, 0; add.s64 %rd1182, %rd1182, 280; @%p1037 bra $L__BB1_481; $L__BB1_712: cvta.to.global.u64 %rd2320, %rd5233; sub.f32 %f518, %f2, %f177; sub.f32 %f519, %f3, %f178; mov.u32 %r2657, 2; mov.u64 %rd7182, 0; mov.u64 %rd2321, %rd5233; mov.u64 %rd1745, %rd7182; mov.u64 %rd7207, %rd7182; @%p423 bra $L__BB1_949; ld.param.u64 %rd1745, [grid_update_param_3]; add.u64 %rd7183, %SP, 560; add.u64 %rd7184, %SPL, 560; add.s64 %rd1717, %rd7184, 8; add.u64 %rd7187, %SP, 0; add.u64 %rd7188, %SPL, 0; add.s64 %rd1718, %rd7188, 8; add.s64 %rd1719, %rd7188, 8; add.s64 %rd1720, %rd7188, 8; add.s64 %rd1721, %rd7188, 8; add.s64 %rd1722, %rd7188, 8; add.s64 %rd1723, %rd7188, 8; add.u64 %rd7199, %SP, 552; add.u64 %rd7200, %SPL, 552; add.s64 %rd1724, %rd7200, 8; add.u64 %rd7201, %SP, 32; add.u64 %rd7202, %SPL, 32; add.s64 %rd1725, %rd7202, 36; add.s64 %rd1726, %rd7202, 4; add.s64 %rd1727, %rd7201, 36; add.s64 %rd1728, %rd7202, 44; add.s64 %rd1729, %rd7201, 44; add.s64 %rd1730, %rd7202, 52; add.s64 %rd1731, %rd7184, 8; add.s64 %rd1732, %rd7184, 8; or.b64 %rd1733, %rd7183, 8; add.s64 %rd1734, %rd7184, 16; add.s64 %rd1735, %rd7, 32; add.s64 %rd1736, %rd7, 48; add.s64 %rd1737, %rd7, 64; add.s64 %rd1738, %rd7, 80; add.s64 %rd1739, %rd7, 96; 
add.s64 %rd1740, %rd7, 112; cvta.to.global.u64 %rd11941, %rd5233; mov.u64 %rd11942, %rd5233; $L__BB1_714: mov.u64 %rd1744, %rd11942; mov.u64 %rd1743, %rd11941; add.s64 %rd1745, %rd1745, -1; setp.eq.s64 %p1039, %rd1744, 0; @%p1039 bra $L__BB1_948; add.s64 %rd1746, %rd1743, 272; ld.global.u32 %r2661, [%rd1743+272]; mov.u64 %rd7207, 0; setp.eq.s32 %p1040, %r2661, 3; mov.u32 %r2660, 2; @%p1040 bra $L__BB1_945; ld.global.u16 %rs463, [%rd1746+-272]; setp.eq.s16 %p1041, %rs463, 1; @%p1041 bra $L__BB1_887; setp.eq.s16 %p1042, %rs463, 2; @%p1042 bra $L__BB1_776; setp.ne.s16 %p1043, %rs463, 3; @%p1043 bra $L__BB1_925; ld.global.u8 %rs71, [%rd1746+-248]; ld.global.f32 %f520, [%rd1746+-16]; sub.f32 %f2792, %f518, %f520; ld.global.f32 %f521, [%rd1746+-12]; sub.f32 %f2793, %f519, %f521; ld.global.f32 %f2794, [%rd1746+-20]; ld.global.f32 %f522, [%rd1746+-24]; mul.f32 %f2795, %f2793, %f2794; fma.rn.f32 %f523, %f2792, %f522, %f2795; mul.f32 %f2796, %f2792, %f2794; mul.f32 %f2797, %f2793, %f522; sub.f32 %f524, %f2797, %f2796; cvta.to.local.u64 %rd1747, %rd7201; mov.u32 %r558, 2; st.local.u32 [%rd1747+20], %r558; ld.global.u64 %rd1748, [%rd1746+-256]; setp.eq.s64 %p1044, %rd1748, 0; @%p1044 bra $L__BB1_773; mov.b32 %r2676, %f524; ld.global.u64 %rd1749, [%rd1746+-264]; mov.b32 %r2677, %f523; and.b32 %r2678, %r2677, 2147483647; mov.b32 %f525, %r2678; and.b32 %r2679, %r2676, 2147483647; mov.b32 %f526, %r2679; mov.u64 %rd11944, 1; bra.uni $L__BB1_721; $L__BB1_729: sub.f32 %f2809, %f5374, %f523; abs.f32 %f541, %f2809; setp.le.f32 %p1054, %f541, 0f34000000; @%p1054 bra $L__BB1_731; abs.f32 %f2810, %f5374; abs.f32 %f2811, %f523; setp.gt.f32 %p1056, %f2811, %f2810; selp.f32 %f2812, %f2811, %f2810, %p1056; mul.f32 %f2813, %f2812, 0f34000000; setp.gtu.f32 %p1057, %f541, %f2813; @%p1057 bra $L__BB1_735; bra.uni $L__BB1_731; $L__BB1_721: shl.b64 %rd7212, %rd11944, 3; add.s64 %rd7213, %rd1749, %rd7212; setp.eq.s64 %p1045, %rd11944, %rd1748; selp.b64 %rd7214, 0, %rd11944, %p1045; shl.b64 %rd7215, 
%rd7214, 3; add.s64 %rd7216, %rd1749, %rd7215; ld.u32 %rd7217, [%rd7213+-8]; ld.u32 %rd7218, [%rd7213+-4]; bfi.b64 %rd1752, %rd7218, %rd7217, 32, 32; ld.u32 %rd7219, [%rd7216]; ld.u32 %rd7220, [%rd7216+4]; bfi.b64 %rd1753, %rd7220, %rd7219, 32, 32; cvt.u32.u64 %r4997, %rd1752; mov.b32 %f5374, %r4997; shr.u64 %rd7221, %rd1752, 32; cvt.u32.u64 %r2682, %rd7221; mov.b32 %f529, %r2682; cvt.u32.u64 %r542, %rd1753; shr.u64 %rd7222, %rd1753, 32; cvt.u32.u64 %r2683, %rd7222; mov.b32 %f530, %r542; sub.f32 %f531, %f530, %f5374; mov.b32 %f2799, %r2683; sub.f32 %f532, %f2799, %f529; sub.f32 %f2800, %f523, %f5374; sub.f32 %f2801, %f524, %f529; mul.f32 %f2802, %f532, %f2801; fma.rn.f32 %f533, %f531, %f2800, %f2802; mul.f32 %f2803, %f532, %f532; fma.rn.f32 %f2804, %f531, %f531, %f2803; add.f32 %f534, %f2804, 0f00000000; setp.gtu.f32 %p1046, %f533, 0f00000000; mov.b64 {%r2684, %r4998}, %rd1752; mov.b64 {%r2685, %r544}, %rd1753; @%p1046 bra $L__BB1_723; bra.uni $L__BB1_722; $L__BB1_723: setp.ltu.f32 %p1047, %f533, %f534; @%p1047 bra $L__BB1_725; bra.uni $L__BB1_724; $L__BB1_725: setp.eq.f32 %p1048, %f534, 0f00000000; @%p1048 bra $L__BB1_772; div.rn.f32 %f2805, %f533, %f534; mov.f32 %f2806, 0f3F800000; sub.f32 %f2807, %f2806, %f2805; mov.b32 %r5000, %f2807; mov.b32 %r5001, %f2805; fma.rn.f32 %f5374, %f531, %f2805, %f5374; mov.b32 %r4997, %f5374; fma.rn.f32 %f5375, %f532, %f2805, %f529; mov.b32 %r4998, %f5375; mov.u32 %r4999, 1; bra.uni $L__BB1_727; $L__BB1_722: mov.b32 %f5375, %r4998; mov.u32 %r4999, 0; mov.u32 %r5000, %r4999; bra.uni $L__BB1_727; $L__BB1_724: mov.b32 %f5375, %r544; mov.u32 %r5000, 1; mov.u32 %r4999, 0; mov.f32 %f5374, %f530; mov.u32 %r4997, %r542; mov.u32 %r4998, %r544; $L__BB1_727: setp.eq.f32 %p1049, %f523, %f5374; @%p1049 bra $L__BB1_731; bra.uni $L__BB1_728; $L__BB1_731: setp.eq.f32 %p1059, %f5375, %f524; mov.pred %p1058, -1; mov.pred %p2925, %p1058; @%p1059 bra $L__BB1_735; setp.eq.f32 %p1061, %f526, 0f7F800000; and.b32 %r2694, %r4998, 2147483647; mov.b32 
%f2814, %r2694; setp.eq.f32 %p1062, %f2814, 0f7F800000; or.pred %p1063, %p1061, %p1062; mov.pred %p2925, 0; @%p1063 bra $L__BB1_735; sub.f32 %f2815, %f5375, %f524; abs.f32 %f542, %f2815; setp.le.f32 %p1065, %f542, 0f34000000; mov.pred %p2925, %p1058; @%p1065 bra $L__BB1_735; abs.f32 %f2816, %f5375; abs.f32 %f2817, %f524; setp.gt.f32 %p1066, %f2817, %f2816; selp.f32 %f2818, %f2817, %f2816, %p1066; mul.f32 %f2819, %f2818, 0f34000000; setp.le.f32 %p2925, %f542, %f2819; bra.uni $L__BB1_735; $L__BB1_728: setp.eq.f32 %p1051, %f525, 0f7F800000; and.b32 %r2693, %r4997, 2147483647; mov.b32 %f2808, %r2693; setp.eq.f32 %p1052, %f2808, 0f7F800000; or.pred %p1053, %p1051, %p1052; mov.pred %p2925, 0; @%p1053 bra $L__BB1_735; bra.uni $L__BB1_729; $L__BB1_735: cvt.u64.u32 %rd7223, %r4998; cvt.u64.u32 %rd7224, %r4997; bfi.b64 %rd1754, %rd7223, %rd7224, 32, 32; mov.b64 {%r2695, %r2696}, %rd1754; selp.u64 %rd1755, 1, 0, %p2925; mov.b32 %f544, %r2696; mov.b32 %f543, %r2695; sub.f32 %f2820, %f543, %f523; sub.f32 %f2821, %f544, %f524; mul.f32 %f2822, %f2821, %f2821; fma.rn.f32 %f2823, %f2820, %f2820, %f2822; add.f32 %f2824, %f2823, 0f00000000; sqrt.rn.f32 %f546, %f2824; setp.geu.f32 %p1067, %f546, %f5376; setp.ne.s32 %p1068, %r558, 2; and.pred %p1069, %p1068, %p1067; @%p1069 bra $L__BB1_737; add.s64 %rd11945, %rd11944, -1; st.local.u64 [%rd1747], %rd11945; st.local.v2.f32 [%rd1747+8], {%f543, %f544}; mov.b64 {%r2699, %r2700}, %rd1755; st.local.v2.u32 [%rd1747+16], {%r2699, %r4999}; st.local.v2.u32 [%rd1747+24], {%r5000, %r5001}; st.local.f32 [%rd1747+32], %f546; st.local.u32 [%rd1747+36], %rd1752; st.local.u32 [%rd1747+44], %rd1753; st.local.u32 [%rd1747+40], %rd7221; st.local.u32 [%rd1747+48], %rd7222; mov.f32 %f5376, %f546; mov.u32 %r558, %r4999; $L__BB1_737: add.s64 %rd1758, %rd11944, 1; setp.lt.u64 %p1070, %rd11944, %rd1748; mov.u64 %rd11944, %rd1758; @%p1070 bra $L__BB1_721; ld.local.u32 %rd7231, [%rd1747+36]; ld.local.u32 %rd7232, [%rd1747+40]; bfi.b64 %rd7233, %rd7232, %rd7231, 
32, 32; mov.u64 %rd7230, 0; cvt.u32.u64 %r2701, %rd7233; mov.b32 %f2825, %r2701; shr.u64 %rd7234, %rd7233, 32; cvt.u32.u64 %r2702, %rd7234; mov.b32 %f2826, %r2702; ld.local.u32 %rd7235, [%rd1747+44]; ld.local.u32 %rd7236, [%rd1747+48]; bfi.b64 %rd7237, %rd7236, %rd7235, 32, 32; cvt.u32.u64 %r2703, %rd7237; shr.u64 %rd7238, %rd7237, 32; cvt.u32.u64 %r2704, %rd7238; mov.b32 %f2827, %r2703; sub.f32 %f548, %f2827, %f2825; mov.b32 %f2828, %r2704; sub.f32 %f549, %f2828, %f2826; mul.f32 %f2829, %f549, %f549; fma.rn.f32 %f2830, %f548, %f548, %f2829; add.f32 %f550, %f2830, 0f00000000; setp.leu.f32 %p1071, %f550, 0f28800000; mov.u64 %rd11946, %rd7230; mov.u64 %rd11947, %rd7230; mov.u64 %rd11948, %rd7230; @%p1071 bra $L__BB1_740; neg.f32 %f2831, %f548; sqrt.rn.f32 %f2832, %f550; div.rn.f32 %f2833, %f549, %f2832; div.rn.f32 %f2834, %f2831, %f2832; mov.b32 %r2705, %f2834; mov.b32 %r2706, %f2833; mov.u64 %rd11948, 1; mov.b64 %rd7241, {%r2706, %r2705}; shr.u64 %rd11947, %rd7241, 32; shl.b64 %rd11946, %rd7241, 32; $L__BB1_740: or.b64 %rd1765, %rd11948, %rd11946; or.b64 %rd1766, %rd7230, %rd11947; and.b64 %rd7242, %rd7230, 4294967295; xor.b64 %rd7243, %rd11948, 1; or.b64 %rd7244, %rd7243, %rd7242; setp.ne.s64 %p1072, %rd7244, 0; @%p1072 bra $L__BB1_771; mov.b64 {%r2707, %r2708}, %rd1766; mov.b64 {%r2709, %r2710}, %rd1765; mov.b32 %f551, %r2710; mov.b32 %f552, %r2707; setp.eq.s32 %p1073, %r558, 1; @%p1073 bra $L__BB1_769; bra.uni $L__BB1_742; $L__BB1_769: ld.local.u64 %rd7323, [%rd1747+8]; cvt.u32.u64 %r2731, %rd7323; mov.b32 %f2862, %r2731; shr.u64 %rd7324, %rd7323, 32; cvt.u32.u64 %r2732, %rd7324; mov.b32 %f2863, %r2732; sub.f32 %f2864, %f518, %f2862; sub.f32 %f2865, %f519, %f2863; mul.f32 %f2866, %f552, %f2865; fma.rn.f32 %f2867, %f551, %f2864, %f2866; setp.le.f32 %p2926, %f2867, 0f00000000; bra.uni $L__BB1_770; $L__BB1_776: ld.global.f32 %f2878, [%rd1746+-16]; mov.u64 %rd7344, 0; sub.f32 %f2879, %f518, %f2878; ld.global.f32 %f2880, [%rd1746+-12]; sub.f32 %f2881, %f519, %f2880; 
ld.global.f32 %f2882, [%rd1746+-20]; ld.global.f32 %f2883, [%rd1746+-24]; mul.f32 %f2884, %f2881, %f2882; fma.rn.f32 %f560, %f2879, %f2883, %f2884; mul.f32 %f2885, %f2879, %f2882; mul.f32 %f2886, %f2881, %f2883; sub.f32 %f561, %f2886, %f2885; mov.b32 %r2740, %f560; mov.b32 %r2741, %f561; cvt.u64.u32 %rd7345, %r2741; cvt.u64.u32 %rd7346, %r2740; bfi.b64 %rd7347, %rd7345, %rd7346, 32, 32; st.local.u64 [%rd7200], %rd7347; ld.global.u64 %rd1868, [%rd1746+-240]; setp.eq.s64 %p1094, %rd1868, 0; mov.u64 %rd7342, 2; mov.u64 %rd12102, %rd7344; mov.u64 %rd12103, %rd7342; mov.u64 %rd12104, %rd7344; @%p1094 bra $L__BB1_882; cvta.to.local.u64 %rd1869, %rd7201; mov.u32 %r2748, 0; st.local.u32 [%rd1869], %r2748; mov.u32 %r2749, -16777217; st.local.u32 [%rd1869+4], %r2749; mov.u32 %r580, 1; st.local.u32 [%rd1869+512], %r580; ld.global.u64 %rd1870, [%rd1746+-248]; ld.global.u64 %rd1871, [%rd1746+-192]; ld.global.u64 %rd1872, [%rd1746+-200]; mov.u32 %r578, 2139095039; mov.u32 %r577, 4; bra.uni $L__BB1_778; $L__BB1_887: ld.global.f32 %f627, [%rd1746+-16]; sub.f32 %f3082, %f518, %f627; ld.global.f32 %f628, [%rd1746+-12]; sub.f32 %f3083, %f519, %f628; ld.global.f32 %f3084, [%rd1746+-20]; ld.global.f32 %f629, [%rd1746+-24]; mul.f32 %f3085, %f3083, %f3084; fma.rn.f32 %f630, %f3082, %f629, %f3085; mul.f32 %f3086, %f3082, %f3084; mul.f32 %f3087, %f3083, %f629; sub.f32 %f631, %f3087, %f3086; mov.b32 %r681, %f630; mov.b32 %r682, %f631; ld.global.u64 %rd2231, [%rd1746+-216]; ld.global.u64 %rd2230, [%rd1746+-224]; sub.f32 %f3088, %f630, %f6; sub.f32 %f3089, %f631, %f6; mov.b32 %r2920, %f3088; mov.b32 %r2921, %f3089; cvt.u64.u32 %rd7734, %r2921; cvt.u64.u32 %rd7735, %r2920; add.f32 %f3090, %f6, %f630; add.f32 %f3091, %f6, %f631; mov.b32 %r2922, %f3090; mov.b32 %r2923, %f3091; cvt.u64.u32 %rd7736, %r2923; cvt.u64.u32 %rd7737, %r2922; bfi.b64 %rd7738, %rd7734, %rd7735, 32, 32; mov.b64 {%r2924, %r2925}, %rd7738; bfi.b64 %rd7739, %rd7736, %rd7737, 32, 32; mov.b64 {%r2926, %r2927}, %rd7739; 
cvta.to.local.u64 %rd2232, %rd7201; mov.u16 %rs529, 2; st.local.u8 [%rd2232+8], %rs529; mov.b32 %f635, %r2927; mov.b32 %f633, %r2925; mov.b32 %f634, %r2926; mov.b32 %f632, %r2924; ld.global.v2.f32 {%f3092, %f3093}, [%rd1746+-232]; div.rn.f32 %f638, %f632, %f3092; div.rn.f32 %f639, %f634, %f3092; ld.global.u64 %rd2233, [%rd1746+-256]; cvt.rn.f32.u64 %f3094, %rd2233; add.f32 %f3095, %f3094, 0fBF800000; rcp.rn.f32 %f640, %f3095; setp.lt.f32 %p1261, %f639, 0fBF000000; setp.gt.f32 %p1262, %f638, 0f3F000000; or.pred %p1263, %p1262, %p1261; @%p1263 bra $L__BB1_919; add.f32 %f3096, %f638, 0f3F000000; div.rn.f32 %f3097, %f3096, %f640; cvt.rmi.f32.f32 %f3098, %f3097; add.s64 %rd7741, %rd2233, -2; cvt.rn.f32.u64 %f3099, %rd7741; setp.gt.f32 %p1264, %f3098, 0f00000000; setp.lt.f32 %p1265, %f3098, %f3099; selp.f32 %f3100, %f3098, %f3099, %p1265; selp.f32 %f3101, %f3100, 0f00000000, %p1264; setp.gt.f32 %p1266, %f3101, 0f5F7FFFFF; max.f32 %f3102, %f3101, 0f00000000; cvt.rzi.u64.f32 %rd7742, %f3102; selp.b64 %rd2239, -1, %rd7742, %p1266; add.f32 %f3103, %f639, 0f3F000000; div.rn.f32 %f3104, %f3103, %f640; cvt.rpi.f32.f32 %f3105, %f3104; add.s64 %rd7743, %rd2233, -1; cvt.rn.f32.u64 %f3106, %rd7743; setp.gt.f32 %p1267, %f3105, 0f00000000; setp.lt.f32 %p1268, %f3105, %f3106; selp.f32 %f3107, %f3105, %f3106, %p1268; selp.f32 %f3108, %f3107, 0f00000000, %p1267; setp.gt.f32 %p1269, %f3108, 0f5F7FFFFF; max.f32 %f3109, %f3108, 0f00000000; cvt.rzi.u64.f32 %rd7744, %f3109; selp.b64 %rd2235, -1, %rd7744, %p1269; setp.ge.u64 %p1270, %rd2239, %rd2235; @%p1270 bra $L__BB1_919; div.rn.f32 %f641, %f633, %f3093; div.rn.f32 %f642, %f635, %f3093; ld.global.u64 %rd2236, [%rd1746+-240]; ld.global.u64 %rd2237, [%rd1746+-248]; ld.global.u64 %rd2238, [%rd1746+-264]; and.b32 %r2928, %r681, 2147483647; mov.b32 %f643, %r2928; and.b32 %r2929, %r682, 2147483647; mov.b32 %f644, %r2929; ld.local.v4.u32 {%r5062, %r5063, %r5064, %r2933}, [%rd2232]; mov.f32 %f5388, 0f7F7FFFFF; bra.uni $L__BB1_890; $L__BB1_925: 
ld.global.f32 %f669, [%rd1746+-16]; sub.f32 %f3151, %f518, %f669; ld.global.f32 %f670, [%rd1746+-12]; sub.f32 %f3152, %f519, %f670; ld.global.f32 %f671, [%rd1746+-20]; ld.global.f32 %f672, [%rd1746+-24]; mul.f32 %f3153, %f3152, %f671; fma.rn.f32 %f673, %f3151, %f672, %f3153; mul.f32 %f3154, %f3151, %f671; mul.f32 %f3155, %f3152, %f672; sub.f32 %f674, %f3155, %f3154; ld.global.u32 %rd7771, [%rd1746+-264]; ld.global.u32 %rd7772, [%rd1746+-260]; bfi.b64 %rd7773, %rd7772, %rd7771, 32, 32; cvt.u32.u64 %r2972, %rd7773; mov.b32 %f3156, %r2972; shr.u64 %rd7774, %rd7773, 32; cvt.u32.u64 %r2973, %rd7774; mov.b32 %f3157, %r2973; neg.f32 %f3158, %f3156; neg.f32 %f3159, %f3157; sub.f32 %f675, %f3158, %f673; sub.f32 %f676, %f3159, %f674; sub.f32 %f677, %f673, %f3156; sub.f32 %f678, %f674, %f3157; setp.ge.f32 %p1319, %f675, 0f00000000; selp.f32 %f3160, %f675, 0f00000000, %p1319; setp.ge.f32 %p1320, %f676, 0f00000000; selp.f32 %f3161, %f676, 0f00000000, %p1320; setp.ge.f32 %p1321, %f677, 0f00000000; selp.f32 %f3162, %f677, 0f00000000, %p1321; setp.ge.f32 %p1322, %f678, 0f00000000; selp.f32 %f3163, %f678, 0f00000000, %p1322; sub.f32 %f679, %f3160, %f3162; mov.b32 %r2974, %f679; sub.f32 %f680, %f3161, %f3163; mov.b32 %r2975, %f680; cvt.u64.u32 %rd7775, %r2975; cvt.u64.u32 %rd7776, %r2974; bfi.b64 %rd7777, %rd7775, %rd7776, 32, 32; st.local.u64 [%rd7184], %rd7777; mov.u64 %rd12118, 2; mov.u64 %rd12111, %rd1717; mov.u64 %rd12112, %rd7184; mov.u64 %rd12113, %rd7184; mov.u64 %rd12114, %rd7183; mov.u64 %rd12115, %rd7184; mov.u64 %rd12116, %rd7184; mov.u64 %rd12117, %rd7183; $L__BB1_926: setp.eq.s64 %p1323, %rd12118, 0; @%p1323 bra $L__BB1_929; add.s64 %rd12118, %rd12118, -1; add.s64 %rd7778, %rd12115, 8; setp.eq.s64 %p1324, %rd12115, %rd12111; selp.b64 %rd12111, %rd7778, %rd12111, %p1324; add.s64 %rd7779, %rd12112, 8; selp.b64 %rd12112, %rd7779, %rd12112, %p1324; add.s64 %rd7780, %rd12113, 8; selp.b64 %rd12113, %rd7780, %rd12113, %p1324; add.s64 %rd7781, %rd12114, 8; selp.b64 %rd12114, 
%rd7781, %rd12114, %p1324; selp.b64 %rd7782, %rd7779, %rd12115, %p1324; selp.b64 %rd7783, %rd7780, %rd12116, %p1324; selp.b64 %rd7784, %rd7781, %rd12117, %p1324; setp.eq.s64 %p1325, %rd12118, 0; add.s64 %rd7785, %rd7782, 4; add.s64 %rd7786, %rd7783, 4; add.s64 %rd7787, %rd7784, 4; selp.b64 %rd12115, %rd7782, %rd7785, %p1325; selp.b64 %rd12116, %rd7783, %rd7786, %p1325; selp.b64 %rd12117, %rd7784, %rd7787, %p1325; ld.local.f32 %f3164, [%rd7783]; setp.eq.f32 %p1326, %f3164, 0f00000000; @%p1326 bra $L__BB1_926; add.f32 %f3165, %f673, %f679; mov.b32 %r2976, %f3165; add.f32 %f3166, %f674, %f680; mov.b32 %r2977, %f3166; cvt.u64.u32 %rd7790, %r2977; cvt.u64.u32 %rd7791, %r2976; bfi.b64 %rd12121, %rd7790, %rd7791, 32, 32; mov.u64 %rd12122, 0; bra.uni $L__BB1_942; $L__BB1_929: setp.lt.f32 %p1327, %f675, %f677; mov.f32 %f5389, 0fFF7FFFFF; @%p1327 bra $L__BB1_932; bra.uni $L__BB1_930; $L__BB1_932: setp.leu.f32 %p1332, %f677, 0fFF7FFFFF; mov.pred %p2930, 0; @%p1332 bra $L__BB1_934; mov.f32 %f5389, %f677; bra.uni $L__BB1_934; $L__BB1_930: setp.leu.f32 %p1329, %f675, 0fFF7FFFFF; mov.pred %p2930, 0; @%p1329 bra $L__BB1_934; mov.pred %p2930, -1; mov.f32 %f5389, %f675; $L__BB1_934: setp.lt.f32 %p1334, %f676, %f678; @%p1334 bra $L__BB1_937; bra.uni $L__BB1_935; $L__BB1_937: setp.gt.f32 %p1336, %f678, %f5389; @%p1336 bra $L__BB1_940; bra.uni $L__BB1_938; $L__BB1_940: cvta.to.local.u64 %rd7798, %rd7201; mov.u64 %rd7799, 0; st.local.u64 [%rd7798], %rd7799; neg.f32 %f5391, %f678; mov.u64 %rd12120, %rd1726; bra.uni $L__BB1_941; $L__BB1_935: setp.leu.f32 %p1335, %f676, %f5389; @%p1335 bra $L__BB1_938; mov.u64 %rd7794, 0; st.local.u64 [%rd7202], %rd7794; mov.u64 %rd12120, %rd1726; mov.f32 %f5389, %f676; bra.uni $L__BB1_939; $L__BB1_938: mov.u64 %rd7796, 0; st.local.u64 [%rd7202], %rd7796; neg.f32 %f5391, %f5389; not.pred %p1337, %p2930; mov.u64 %rd12120, %rd7202; @%p1337 bra $L__BB1_941; $L__BB1_939: mov.f32 %f5391, %f5389; $L__BB1_941: st.local.f32 [%rd12120], %f5391; ld.local.u64 
%rd7804, [%rd7202]; cvt.u32.u64 %r2978, %rd7804; mov.b32 %f3169, %r2978; shr.u64 %rd7805, %rd7804, 32; cvt.u32.u64 %r2979, %rd7805; mov.b32 %f3170, %r2979; add.f32 %f3171, %f673, %f3169; add.f32 %f3172, %f674, %f3170; mov.b32 %r2980, %f3171; mov.b32 %r2981, %f3172; cvt.u64.u32 %rd7806, %r2981; cvt.u64.u32 %rd7807, %r2980; bfi.b64 %rd12121, %rd7806, %rd7807, 32, 32; mov.u64 %rd12122, 1; $L__BB1_942: mov.u64 %rd11138, 0; cvt.u32.u64 %r2982, %rd12121; mov.b32 %f3173, %r2982; shr.u64 %rd7808, %rd12121, 32; cvt.u32.u64 %r2983, %rd7808; mov.b32 %f3174, %r2983; mul.f32 %f3175, %f672, %f3173; mul.f32 %f3176, %f671, %f3174; sub.f32 %f3177, %f3175, %f3176; mul.f32 %f3178, %f672, %f3174; fma.rn.f32 %f3179, %f671, %f3173, %f3178; add.f32 %f3180, %f669, %f3177; mov.b32 %r2984, %f3180; add.f32 %f3181, %f670, %f3179; mov.b32 %r2985, %f3181; cvt.u64.u32 %rd7809, %r2985; cvt.u64.u32 %rd7810, %r2984; bfi.b64 %rd7811, %rd7809, %rd7810, 32, 32; or.b64 %rd7812, %rd11138, %rd7811; mov.b64 {%r5065, %r5066}, %rd7812; mov.b64 {%r5067, %r2986}, %rd12122; bra.uni $L__BB1_943; $L__BB1_907: sub.f32 %f3122, %f5386, %f630; abs.f32 %f662, %f3122; setp.le.f32 %p1289, %f662, 0f34000000; @%p1289 bra $L__BB1_909; abs.f32 %f3123, %f5386; abs.f32 %f3124, %f630; setp.gt.f32 %p1291, %f3124, %f3123; selp.f32 %f3125, %f3124, %f3123, %p1291; mul.f32 %f3126, %f3125, 0f34000000; setp.gtu.f32 %p1292, %f662, %f3126; @%p1292 bra $L__BB1_913; bra.uni $L__BB1_909; $L__BB1_890: setp.gt.u64 %p1271, %rd2236, %rd2239; @%p1271 bra $L__BB1_892; bra.uni $L__BB1_891; $L__BB1_892: add.s64 %rd7745, %rd2237, %rd2239; ld.u8 %rs530, [%rd7745]; setp.eq.s16 %p1272, %rs530, 0; @%p1272 bra $L__BB1_917; cvt.rn.f32.u64 %f3111, %rd2239; fma.rn.f32 %f646, %f640, %f3111, 0fBF000000; setp.gt.u64 %p1273, %rd2233, %rd2239; @%p1273 bra $L__BB1_895; bra.uni $L__BB1_894; $L__BB1_895: shl.b64 %rd7746, %rd2239, 2; add.s64 %rd2240, %rd2238, %rd7746; ld.f32 %f647, [%rd2240]; add.s64 %rd7747, %rd2239, 1; setp.gt.u64 %p1274, %rd2233, %rd7747; 
@%p1274 bra $L__BB1_897; bra.uni $L__BB1_896; $L__BB1_897: ld.f32 %f648, [%rd2240+4]; setp.gt.f32 %p1275, %f648, %f642; setp.gt.f32 %p1276, %f647, %f642; and.pred %p1277, %p1276, %p1275; @%p1277 bra $L__BB1_917; setp.lt.f32 %p1278, %f647, %f641; setp.lt.f32 %p1279, %f648, %f641; and.pred %p1280, %p1278, %p1279; @%p1280 bra $L__BB1_917; mul.f32 %f3112, %f3092, %f646; mov.b32 %r2934, %f3112; mul.f32 %f651, %f3093, %f647; mov.b32 %r2935, %f651; cvt.u64.u32 %rd7748, %r2935; cvt.u64.u32 %rd7749, %r2934; add.f32 %f3113, %f640, %f646; mul.f32 %f649, %f3092, %f3113; mov.b32 %r689, %f649; mul.f32 %f3114, %f3093, %f648; mov.b32 %r2936, %f3114; cvt.u64.u32 %rd7750, %r2936; cvt.u64.u32 %rd7751, %r689; bfi.b64 %rd7752, %rd7750, %rd7751, 32, 32; bfi.b64 %rd7753, %rd7748, %rd7749, 32, 32; cvt.u32.u64 %r5060, %rd7753; mov.b32 %f5386, %r5060; sub.f32 %f652, %f649, %f5386; sub.f32 %f653, %f3114, %f651; sub.f32 %f3115, %f630, %f5386; sub.f32 %f3116, %f631, %f651; mul.f32 %f3117, %f653, %f3116; fma.rn.f32 %f654, %f652, %f3115, %f3117; mul.f32 %f3118, %f653, %f653; fma.rn.f32 %f3119, %f652, %f652, %f3118; add.f32 %f655, %f3119, 0f00000000; setp.gtu.f32 %p1281, %f654, 0f00000000; mov.b64 {%r2937, %r5061}, %rd7753; mov.b64 {%r2938, %r692}, %rd7752; @%p1281 bra $L__BB1_901; bra.uni $L__BB1_900; $L__BB1_901: setp.ltu.f32 %p1282, %f654, %f655; @%p1282 bra $L__BB1_903; bra.uni $L__BB1_902; $L__BB1_903: setp.eq.f32 %p1283, %f655, 0f00000000; @%p1283 bra $L__BB1_916; div.rn.f32 %f3120, %f654, %f655; fma.rn.f32 %f5386, %f652, %f3120, %f5386; mov.b32 %r5060, %f5386; fma.rn.f32 %f5387, %f653, %f3120, %f651; mov.b32 %r5061, %f5387; bra.uni $L__BB1_905; $L__BB1_900: mov.b32 %f5387, %r5061; bra.uni $L__BB1_905; $L__BB1_902: mov.b32 %f5387, %r692; mov.f32 %f5386, %f649; mov.u32 %r5060, %r689; mov.u32 %r5061, %r692; $L__BB1_905: setp.eq.f32 %p1284, %f630, %f5386; @%p1284 bra $L__BB1_909; bra.uni $L__BB1_906; $L__BB1_909: setp.eq.f32 %p1294, %f5387, %f631; mov.pred %p1293, -1; mov.pred %p2928, %p1293; 
@%p1294 bra $L__BB1_913; setp.eq.f32 %p1296, %f644, 0f7F800000; and.b32 %r2940, %r5061, 2147483647; mov.b32 %f3127, %r2940; setp.eq.f32 %p1297, %f3127, 0f7F800000; or.pred %p1298, %p1296, %p1297; mov.pred %p2928, 0; @%p1298 bra $L__BB1_913; sub.f32 %f3128, %f5387, %f631; abs.f32 %f663, %f3128; setp.le.f32 %p1300, %f663, 0f34000000; mov.pred %p2928, %p1293; @%p1300 bra $L__BB1_913; abs.f32 %f3129, %f5387; abs.f32 %f3130, %f631; setp.gt.f32 %p1301, %f3130, %f3129; selp.f32 %f3131, %f3130, %f3129, %p1301; mul.f32 %f3132, %f3131, 0f34000000; setp.le.f32 %p2928, %f663, %f3132; bra.uni $L__BB1_913; $L__BB1_906: setp.eq.f32 %p1286, %f643, 0f7F800000; and.b32 %r2939, %r5060, 2147483647; mov.b32 %f3121, %r2939; setp.eq.f32 %p1287, %f3121, 0f7F800000; or.pred %p1288, %p1286, %p1287; mov.pred %p2928, 0; @%p1288 bra $L__BB1_913; bra.uni $L__BB1_907; $L__BB1_913: cvt.u64.u32 %rd7754, %r5061; cvt.u64.u32 %rd7755, %r5060; bfi.b64 %rd2241, %rd7754, %rd7755, 32, 32; mov.b64 {%r2941, %r2942}, %rd2241; selp.u64 %rd2242, 1, 0, %p2928; mov.b32 %f3133, %r2941; sub.f32 %f3134, %f3133, %f630; mov.b32 %f3135, %r2942; sub.f32 %f3136, %f3135, %f631; mul.f32 %f3137, %f3136, %f3136; fma.rn.f32 %f3138, %f3134, %f3134, %f3137; add.f32 %f664, %f3138, 0f00000000; setp.geu.f32 %p1302, %f664, %f5388; @%p1302 bra $L__BB1_917; sqrt.rn.f32 %f3139, %f664; setp.gtu.f32 %p1303, %f3139, %f6; mov.f32 %f5388, %f664; @%p1303 bra $L__BB1_917; mov.b64 {%r5064, %r2943}, %rd2242; mov.u32 %r5062, %r2941; mov.u32 %r5063, %r2942; mov.f32 %f5388, %f664; $L__BB1_917: add.s64 %rd2239, %rd2239, 1; setp.lt.u64 %p1304, %rd2239, %rd2235; @%p1304 bra $L__BB1_890; st.local.u32 [%rd2232+8], %r5064; mov.b64 %rd7756, {%r5062, %r5063}; st.local.u64 [%rd2232], %rd7756; $L__BB1_919: cvt.u64.u32 %rd7757, %r681; cvt.u64.u32 %rd7758, %r682; bfi.b64 %rd2244, %rd7758, %rd7757, 32, 32; ld.local.v4.u32 {%r2947, %r2948, %r2949, %r2950}, [%rd2232]; mov.b64 %rd2246, {%r2949, %r2950}; mov.b64 %rd2245, {%r2947, %r2948}; mov.b32 {%rs531, 
%rs532}, %r2949; and.b16 %rs533, %rs531, 255; setp.eq.s16 %p1305, %rs533, 2; cvt.u64.u16 %rd7759, %rs531; and.b64 %rd7760, %rd7759, 255; selp.b64 %rd7761, 2, %rd7760, %p1305; and.b64 %rd7762, %rd2246, 4294967040; or.b64 %rd7763, %rd7762, %rd7761; mov.b64 {%r2955, %r2956}, %rd7763; mov.b32 {%rs1027, %rs534}, %r2955; and.b16 %rs535, %rs1027, 255; setp.eq.s16 %p1306, %rs535, 2; mov.u32 %r5067, 2; mov.u32 %r5065, 0; mov.u32 %r5066, %r5065; @%p1306 bra $L__BB1_943; ld.global.u8 %rs536, [%rd1746+-208]; setp.eq.s16 %p1307, %rs536, 0; shr.u64 %rd7764, %rd2245, 32; cvt.u32.u64 %r2957, %rd7764; mov.b32 %f666, %r2957; @%p1307 bra $L__BB1_924; mov.b64 {%r2958, %r2959}, %rd2244; mov.b32 %f668, %r2959; mov.b32 %f667, %r2958; mov.b64 {%r2960, %r2961}, %rd2230; mov.b64 {%r2962, %r2963}, %rd2231; ld.global.u8 %rs89, [%rd1746+-207]; mov.b32 %f3140, %r2962; setp.gt.f32 %p1309, %f667, %f3140; mov.b32 %f3141, %r2960; setp.lt.f32 %p1310, %f667, %f3141; or.pred %p1311, %p1310, %p1309; mov.pred %p2929, 0; @%p1311 bra $L__BB1_923; setp.geu.f32 %p1312, %f668, 0fFF7FFFFF; setp.leu.f32 %p1313, %f668, 0f7F7FFFFF; and.pred %p2929, %p1313, %p1312; $L__BB1_923: setp.ge.f32 %p1314, %f631, %f666; setp.le.f32 %p1315, %f631, %f666; setp.eq.s16 %p1316, %rs89, 0; selp.u32 %r2964, -1, 0, %p1314; selp.u32 %r2965, -1, 0, %p1315; selp.b32 %r2966, %r2965, %r2964, %p1316; and.b32 %r2967, %r2966, 1; setp.eq.b32 %p1317, %r2967, 1; and.pred %p1318, %p1317, %p2929; selp.u16 %rs1027, 1, 0, %p1318; $L__BB1_924: cvt.u32.u64 %r2968, %rd2245; mov.b32 %f3142, %r2968; mul.f32 %f3143, %f629, %f3142; ld.global.f32 %f3144, [%rd1746+-20]; mul.f32 %f3145, %f3144, %f666; sub.f32 %f3146, %f3143, %f3145; mul.f32 %f3147, %f3144, %f3142; fma.rn.f32 %f3148, %f629, %f666, %f3147; add.f32 %f3149, %f627, %f3146; mov.b32 %r2969, %f3149; add.f32 %f3150, %f628, %f3148; mov.b32 %r2970, %f3150; cvt.u64.u32 %rd7765, %r2970; cvt.u64.u32 %rd7766, %r2969; cvt.u64.u16 %rd7767, %rs1027; bfi.b64 %rd7768, %rd7765, %rd7766, 32, 32; and.b64 
%rd7769, %rd7767, 255; mov.b64 {%r5065, %r5066}, %rd7768; mov.b64 {%r5067, %r2971}, %rd7769; bra.uni $L__BB1_943; $L__BB1_742: ld.local.u32 %r2711, [%rd1747+24]; setp.eq.s32 %p1074, %r2711, 0; @%p1074 bra $L__BB1_755; setp.ne.s32 %p1075, %r2711, 1; @%p1075 bra $L__BB1_768; add.s64 %rd1767, %rd11945, 1; or.b64 %rd7245, %rd1767, %rd1748; and.b64 %rd7246, %rd7245, -4294967296; setp.eq.s64 %p1076, %rd7246, 0; @%p1076 bra $L__BB1_746; rem.u64 %rd11949, %rd1767, %rd1748; bra.uni $L__BB1_747; $L__BB1_755: setp.eq.s64 %p1083, %rd11945, 0; selp.b64 %rd1814, %rd1748, %rd11945, %p1083; add.s64 %rd7285, %rd1814, -1; setp.gt.u64 %p1084, %rd1748, %rd7285; @%p1084 bra $L__BB1_757; bra.uni $L__BB1_756; $L__BB1_757: shl.b64 %rd7286, %rd1814, 3; add.s64 %rd7287, %rd1749, %rd7286; ld.u32 %rd7288, [%rd7287+-8]; ld.u32 %rd7289, [%rd7287+-4]; bfi.b64 %rd1815, %rd7289, %rd7288, 32, 32; or.b64 %rd7290, %rd1814, %rd1748; and.b64 %rd7291, %rd7290, -4294967296; setp.eq.s64 %p1085, %rd7291, 0; @%p1085 bra $L__BB1_759; rem.u64 %rd11966, %rd1814, %rd1748; bra.uni $L__BB1_760; $L__BB1_873: ld.u32 %r2897, [%rd1880+76]; cvt.u64.u32 %rd7673, %r2897; setp.le.u64 %p1251, %rd1871, %rd7673; mul.wide.u32 %rd7674, %r2897, 12; add.s64 %rd7675, %rd1872, %rd7674; setp.eq.s64 %p1252, %rd7675, 0; or.pred %p1253, %p1251, %p1252; selp.b32 %r575, %r575, %r5021, %p1253; selp.b32 %r574, %r574, %r5020, %p1253; selp.b32 %r573, %r573, %r5019, %p1253; selp.b32 %r577, %r577, %r5034, %p1253; selp.b32 %r578, %r578, %r627, %p1253; $L__BB1_778: mov.b32 %f562, %r578; $L__BB1_779: mov.u32 %r579, %r580; setp.eq.s32 %p1095, %r579, 0; @%p1095 bra $L__BB1_880; cvt.u64.u32 %rd7354, %r579; add.s64 %rd7355, %rd7354, -1; cvt.u32.u64 %r580, %rd7355; st.local.u32 [%rd1869+512], %r580; mul.wide.u32 %rd7356, %r579, 8; add.s64 %rd7357, %rd1869, %rd7356; ld.local.u32 %rd1878, [%rd7357+-4]; ld.local.u32 %rd7358, [%rd7357+-8]; shl.b64 %rd7359, %rd7358, 32; or.b64 %rd1877, %rd7359, 1; mov.b64 {%r2753, %r2754}, %rd1878; mov.b32 %f2887, 
%r2753; neg.f32 %f2888, %f2887; setp.le.f32 %p1096, %f562, %f2888; @%p1096 bra $L__BB1_779; mov.b64 {%r2755, %r2756}, %rd1877; cvt.u64.u32 %rd1879, %r2756; setp.gt.u64 %p1097, %rd1868, %rd1879; @%p1097 bra $L__BB1_783; bra.uni $L__BB1_782; $L__BB1_783: mul.lo.s64 %rd7360, %rd1879, 96; add.s64 %rd1880, %rd1870, %rd7360; ld.u8 %rs471, [%rd1880+88]; and.b16 %rs472, %rs471, 1; setp.eq.b16 %p1099, %rs472, 1; mov.pred %p2927, 0; xor.pred %p1100, %p1099, %p2927; not.pred %p1101, %p1100; @%p1101 bra $L__BB1_785; ld.v4.u32 {%r2757, %r2758, %r2759, %r2760}, [%rd1880+64]; cvt.u64.u32 %rd7361, %r2757; setp.gt.u64 %p1103, %rd1871, %rd7361; mul.wide.u32 %rd7362, %r2757, 12; add.s64 %rd7363, %rd1872, %rd7362; selp.b64 %rd7364, %rd7363, 0, %p1103; setp.eq.s64 %p1104, %rd7364, 0; add.s64 %rd7365, %rd7364, 8; selp.b64 %rd11987, 0, %rd7365, %p1104; cvt.u64.u32 %rd7366, %r2758; setp.gt.u64 %p1105, %rd1871, %rd7366; mul.wide.u32 %rd7367, %r2758, 12; add.s64 %rd7368, %rd1872, %rd7367; selp.b64 %rd7369, %rd7368, 0, %p1105; setp.eq.s64 %p1106, %rd7369, 0; add.s64 %rd7370, %rd7369, 8; selp.b64 %rd11986, 0, %rd7370, %p1106; ld.u32 %r2764, [%rd1880+72]; cvt.u64.u32 %rd7371, %r2764; setp.gt.u64 %p1107, %rd1871, %rd7371; mul.wide.u32 %rd7372, %r2764, 12; add.s64 %rd7373, %rd1872, %rd7372; selp.b64 %rd7374, %rd7373, 0, %p1107; setp.eq.s64 %p1108, %rd7374, 0; add.s64 %rd7375, %rd7374, 8; selp.b64 %rd11985, 0, %rd7375, %p1108; cvt.u64.u32 %rd7376, %r2760; setp.gt.u64 %p1109, %rd1871, %rd7376; mul.wide.u32 %rd7377, %r2760, 12; add.s64 %rd7378, %rd1872, %rd7377; selp.b64 %rd7379, %rd7378, 0, %p1109; setp.eq.s64 %p1110, %rd7379, 0; add.s64 %rd7380, %rd7379, 8; selp.b64 %rd11984, 0, %rd7380, %p1110; mov.pred %p2927, -1; $L__BB1_785: ld.v4.f32 {%f2889, %f2890, %f2891, %f2892}, [%rd1880]; sub.f32 %f2897, %f2889, %f560; sub.f32 %f2898, %f2890, %f560; sub.f32 %f2899, %f2891, %f560; sub.f32 %f2900, %f2892, %f560; ld.v4.f32 {%f2901, %f2902, %f2903, %f2904}, [%rd1880+16]; sub.f32 %f2909, %f2901, %f561; 
sub.f32 %f2910, %f2902, %f561; sub.f32 %f2911, %f2903, %f561; sub.f32 %f2912, %f2904, %f561; ld.v4.f32 {%f2913, %f2914, %f2915, %f2916}, [%rd1880+32]; sub.f32 %f2921, %f560, %f2913; sub.f32 %f2922, %f560, %f2914; sub.f32 %f2923, %f560, %f2915; sub.f32 %f2924, %f560, %f2916; ld.v4.f32 {%f2925, %f2926, %f2927, %f2928}, [%rd1880+48]; sub.f32 %f2933, %f561, %f2925; sub.f32 %f2934, %f561, %f2926; sub.f32 %f2935, %f561, %f2927; sub.f32 %f2936, %f561, %f2928; setp.ge.f32 %p1111, %f2897, %f2921; selp.f32 %f2937, %f2897, %f2921, %p1111; setp.ge.f32 %p1112, %f2898, %f2922; selp.f32 %f2938, %f2898, %f2922, %p1112; setp.ge.f32 %p1113, %f2899, %f2923; selp.f32 %f2939, %f2899, %f2923, %p1113; setp.ge.f32 %p1114, %f2900, %f2924; selp.f32 %f2940, %f2900, %f2924, %p1114; setp.ge.f32 %p1115, %f2909, %f2933; selp.f32 %f2941, %f2909, %f2933, %p1115; setp.ge.f32 %p1116, %f2910, %f2934; selp.f32 %f2942, %f2910, %f2934, %p1116; setp.ge.f32 %p1117, %f2911, %f2935; selp.f32 %f2943, %f2911, %f2935, %p1117; setp.ge.f32 %p1118, %f2912, %f2936; selp.f32 %f2944, %f2912, %f2936, %p1118; setp.ge.f32 %p1119, %f2937, 0f00000000; selp.f32 %f2945, %f2937, 0f00000000, %p1119; setp.ge.f32 %p1120, %f2938, 0f00000000; selp.f32 %f2946, %f2938, 0f00000000, %p1120; setp.ge.f32 %p1121, %f2939, 0f00000000; selp.f32 %f2947, %f2939, 0f00000000, %p1121; setp.ge.f32 %p1122, %f2940, 0f00000000; selp.f32 %f2948, %f2940, 0f00000000, %p1122; mov.b32 %r2765, %f2945; mov.b32 %r2766, %f2946; mov.b32 %r2767, %f2947; mov.b32 %r2768, %f2948; cvt.u64.u32 %rd7381, %r2768; cvt.u64.u32 %rd7382, %r2766; cvt.u64.u32 %rd7383, %r2765; cvt.u64.u32 %rd7384, %r2767; bfi.b64 %rd7385, %rd7381, %rd7384, 32, 32; bfi.b64 %rd7386, %rd7382, %rd7383, 32, 32; setp.ge.f32 %p1123, %f2941, 0f00000000; selp.f32 %f2949, %f2941, 0f00000000, %p1123; setp.ge.f32 %p1124, %f2942, 0f00000000; selp.f32 %f2950, %f2942, 0f00000000, %p1124; setp.ge.f32 %p1125, %f2943, 0f00000000; selp.f32 %f2951, %f2943, 0f00000000, %p1125; setp.ge.f32 %p1126, %f2944, 
0f00000000; selp.f32 %f2952, %f2944, 0f00000000, %p1126; mov.b32 %r2769, %f2949; mov.b32 %r2770, %f2950; mov.b32 %r2771, %f2951; mov.b32 %r2772, %f2952; cvt.u64.u32 %rd7387, %r2772; cvt.u64.u32 %rd7388, %r2770; cvt.u64.u32 %rd7389, %r2769; cvt.u64.u32 %rd7390, %r2771; bfi.b64 %rd7391, %rd7387, %rd7390, 32, 32; bfi.b64 %rd7392, %rd7388, %rd7389, 32, 32; mov.b64 {%r2773, %r2774}, %rd7386; mov.b64 {%r2775, %r2776}, %rd7385; cvt.u64.u32 %rd7393, %r2776; cvt.u64.u32 %rd7394, %r2774; cvt.u64.u32 %rd7395, %r2775; bfi.b64 %rd7396, %rd7393, %rd7395, 32, 32; mov.b64 {%r2777, %r2778}, %rd7396; bfi.b64 %rd7397, %rd7394, %rd7383, 32, 32; mov.b64 {%r2779, %r2780}, %rd7397; mov.b32 %f2953, %r2779; mov.b32 %f2954, %r2780; mov.b32 %f2955, %r2777; mov.b32 %f2956, %r2778; mov.b32 %f2957, %r2773; mov.b32 %f2958, %r2774; mov.b32 %f2959, %r2775; mov.b32 %f2960, %r2776; mov.b64 {%r2781, %r2782}, %rd7392; mov.b64 {%r2783, %r2784}, %rd7391; cvt.u64.u32 %rd7398, %r2784; cvt.u64.u32 %rd7399, %r2782; cvt.u64.u32 %rd7400, %r2783; bfi.b64 %rd7401, %rd7398, %rd7400, 32, 32; mov.b64 {%r2785, %r2786}, %rd7401; bfi.b64 %rd7402, %rd7399, %rd7389, 32, 32; mov.b64 {%r2787, %r2788}, %rd7402; mov.b32 %f2961, %r2787; mov.b32 %f2962, %r2788; mov.b32 %f2963, %r2785; mov.b32 %f2964, %r2786; mov.b32 %f2965, %r2781; mov.b32 %f2966, %r2782; mov.b32 %f2967, %r2783; mov.b32 %f2968, %r2784; mul.f32 %f2969, %f2965, %f2961; mul.f32 %f2970, %f2966, %f2962; mul.f32 %f2971, %f2967, %f2963; mul.f32 %f2972, %f2968, %f2964; fma.rn.f32 %f2973, %f2957, %f2953, %f2969; fma.rn.f32 %f2974, %f2958, %f2954, %f2970; fma.rn.f32 %f2975, %f2959, %f2955, %f2971; fma.rn.f32 %f2976, %f2960, %f2956, %f2972; add.f32 %f2977, %f2973, 0f00000000; add.f32 %f2978, %f2974, 0f00000000; add.f32 %f2979, %f2975, 0f00000000; add.f32 %f2980, %f2976, 0f00000000; sqrt.rn.f32 %f2981, %f2977; sqrt.rn.f32 %f2982, %f2978; sqrt.rn.f32 %f2983, %f2979; sqrt.rn.f32 %f2984, %f2980; mov.b32 %r2789, %f2981; mov.b32 %r2790, %f2982; mov.b32 %r2791, %f2983; 
mov.b32 %r2792, %f2984; cvt.u64.u32 %rd7403, %r2792; cvt.u64.u32 %rd7404, %r2790; cvt.u64.u32 %rd7405, %r2789; cvt.u64.u32 %rd7406, %r2791; bfi.b64 %rd12093, %rd7403, %rd7406, 32, 32; mov.b64 {%r2793, %r2794}, %rd12093; bfi.b64 %rd12092, %rd7404, %rd7405, 32, 32; mov.b64 {%r2795, %r2796}, %rd12092; mov.b32 %f2985, %r2795; mov.b32 %f2986, %r2796; mov.b32 %f2987, %r2793; mov.b32 %f2988, %r2794; setp.lt.f32 %p1127, %f2985, %f562; setp.lt.f32 %p1128, %f2986, %f562; setp.lt.f32 %p1129, %f2987, %f562; setp.lt.f32 %p1130, %f2988, %f562; selp.u32 %r2797, 1, 0, %p1127; selp.u32 %r2798, -1, 0, %p1128; bfi.b32 %r2799, %r2798, %r2797, 8, 1; selp.u32 %r2800, -1, 0, %p1129; bfi.b32 %r2801, %r2800, %r2799, 16, 1; selp.u32 %r2802, -1, 0, %p1130; bfi.b32 %r2803, %r2802, %r2801, 24, 1; cvt.u64.u32 %rd7407, %r2803; mov.b64 {%r2804, %r2805}, %rd7407; mov.b32 {%rs473, %rs474}, %r2804; and.b16 %rs475, %rs473, 1; shr.u16 %rs476, %rs473, 7; and.b16 %rs477, %rs476, 2; or.b16 %rs478, %rs477, %rs475; shl.b16 %rs479, %rs474, 2; and.b16 %rs480, %rs479, 4; or.b16 %rs481, %rs478, %rs480; shr.u16 %rs482, %rs474, 5; and.b16 %rs483, %rs482, 8; or.b16 %rs484, %rs481, %rs483; cvt.u64.u16 %rd1891, %rs484; @%p2927 bra $L__BB1_787; bra.uni $L__BB1_786; $L__BB1_787: mov.u64 %rd7408, 1; st.local.v2.u64 [%rd8], {%rd11987, %rd11986}; st.local.v2.u64 [%rd8+16], {%rd11985, %rd11984}; mov.f32 %f2989, 0f00000000; st.local.v4.f32 [%rd24], {%f2989, %f2989, %f2989, %f2989}; mov.u32 %r2816, 4; st.local.u32 [%rd7+16], %r2816; st.local.u32 [%rd7+52], %r2816; st.local.u32 [%rd7+88], %r2816; st.local.u32 [%rd7+124], %r2816; mov.u64 %rd1895, %rd7408; $L__BB1_788: add.s64 %rd7412, %rd1895, -1; cvt.u32.u64 %r2817, %rd7412; shl.b64 %rd7414, %rd7408, %r2817; and.b64 %rd7415, %rd7414, %rd1891; setp.eq.s64 %p1131, %rd7415, 0; @%p1131 bra $L__BB1_841; shl.b64 %rd7416, %rd1895, 3; add.s64 %rd7417, %rd8, %rd7416; ld.local.u64 %rd1896, [%rd7417+-8]; setp.eq.s64 %p1132, %rd1896, 0; @%p1132 bra $L__BB1_841; ld.u32 %r581, [%rd1896]; 
cvt.u64.u32 %rd1897, %r581; ld.global.u64 %rd7418, [%rd1746+-160]; setp.gt.u64 %p1133, %rd7418, %rd1897; @%p1133 bra $L__BB1_792; bra.uni $L__BB1_791; $L__BB1_792: ld.global.u64 %rd7419, [%rd1746+-168]; mul.lo.s64 %rd7420, %rd1897, 12; add.s64 %rd1898, %rd7419, %rd7420; ld.u32 %rd1899, [%rd1898+8]; ld.u32 %rd1900, [%rd1898]; ld.global.u64 %rd1901, [%rd1746+-176]; setp.gt.u64 %p1134, %rd1901, %rd1900; @%p1134 bra $L__BB1_794; bra.uni $L__BB1_793; $L__BB1_794: ld.global.u64 %rd1902, [%rd1746+-184]; shl.b64 %rd7421, %rd1900, 3; add.s64 %rd7422, %rd1902, %rd7421; ld.u32 %rd7423, [%rd7422]; ld.u32 %rd7424, [%rd7422+4]; bfi.b64 %rd1903, %rd7424, %rd7423, 32, 32; ld.u32 %rd1904, [%rd1898+4]; setp.gt.u64 %p1135, %rd1901, %rd1904; @%p1135 bra $L__BB1_796; bra.uni $L__BB1_795; $L__BB1_796: setp.gt.u64 %p1136, %rd1901, %rd1899; @%p1136 bra $L__BB1_798; bra.uni $L__BB1_797; $L__BB1_798: shl.b64 %rd7425, %rd1904, 3; add.s64 %rd7426, %rd1902, %rd7425; shl.b64 %rd7427, %rd1899, 3; add.s64 %rd7428, %rd1902, %rd7427; cvt.u32.u64 %r2818, %rd1903; mov.b32 %f563, %r2818; shr.u64 %rd7429, %rd1903, 32; cvt.u32.u64 %r2819, %rd7429; mov.b32 %f564, %r2819; ld.u32 %rd7430, [%rd7426]; ld.u32 %rd7431, [%rd7426+4]; bfi.b64 %rd1905, %rd7431, %rd7430, 32, 32; cvt.u32.u64 %r2820, %rd1905; shr.u64 %rd7432, %rd1905, 32; cvt.u32.u64 %r2821, %rd7432; mov.b32 %f565, %r2820; sub.f32 %f566, %f565, %f563; mov.b32 %f5380, %r2821; sub.f32 %f568, %f5380, %f564; ld.u32 %rd7433, [%rd7428]; ld.u32 %rd7434, [%rd7428+4]; bfi.b64 %rd1906, %rd7434, %rd7433, 32, 32; cvt.u32.u64 %r2822, %rd1906; shr.u64 %rd7435, %rd1906, 32; cvt.u32.u64 %r2823, %rd7435; mov.b32 %f569, %r2822; sub.f32 %f570, %f569, %f563; mov.b32 %f571, %r2823; sub.f32 %f572, %f571, %f564; sub.f32 %f573, %f560, %f563; sub.f32 %f574, %f561, %f564; mul.f32 %f2990, %f568, %f574; fma.rn.f32 %f575, %f566, %f573, %f2990; mul.f32 %f2991, %f572, %f574; fma.rn.f32 %f576, %f570, %f573, %f2991; setp.le.f32 %p1137, %f575, 0f00000000; setp.le.f32 %p1138, %f576, 
0f00000000; and.pred %p1139, %p1137, %p1138; @%p1139 bra $L__BB1_836; bra.uni $L__BB1_799; $L__BB1_836: add.u64 %rd12078, %SP, 552; cvta.to.local.u64 %rd12076, %rd12078; add.u64 %rd12084, %SP, 0; cvta.to.local.u64 %rd12082, %rd12084; st.local.u64 [%rd12082], %rd1903; mov.u64 %rd12089, 2; mov.u64 %rd12075, %rd1724; mov.u64 %rd12077, %rd12076; mov.u64 %rd12079, %rd12076; mov.u64 %rd12080, %rd12076; mov.u64 %rd12081, %rd12078; mov.u64 %rd12083, %rd12082; mov.u64 %rd12085, %rd12082; mov.u64 %rd12086, %rd12082; mov.u64 %rd12087, %rd12084; mov.u64 %rd12088, %rd1718; $L__BB1_837: setp.eq.s64 %p1192, %rd12089, 0; mov.u64 %rd12090, 1; @%p1192 bra $L__BB1_839; add.s64 %rd12089, %rd12089, -1; add.s64 %rd7580, %rd12076, 8; setp.eq.s64 %p1193, %rd12079, %rd12075; selp.b64 %rd7581, %rd7580, %rd12079, %p1193; add.s64 %rd7582, %rd12077, 8; selp.b64 %rd7583, %rd7582, %rd12080, %p1193; add.s64 %rd7584, %rd12078, 8; selp.b64 %rd7585, %rd7584, %rd12081, %p1193; mov.u64 %rd12090, 0; setp.eq.s64 %p1194, %rd12089, 0; add.s64 %rd7586, %rd7581, 4; add.s64 %rd7587, %rd7583, 4; add.s64 %rd7588, %rd7585, 4; selp.b64 %rd2132, %rd7581, %rd7586, %p1194; selp.b64 %rd12080, %rd7583, %rd7587, %p1194; selp.b64 %rd12081, %rd7585, %rd7588, %p1194; selp.b64 %rd12076, %rd7580, %rd12076, %p1193; selp.b64 %rd12077, %rd7582, %rd12077, %p1193; selp.b64 %rd12078, %rd7584, %rd12078, %p1193; add.s64 %rd7589, %rd12079, 8; selp.b64 %rd12075, %rd7589, %rd12075, %p1193; add.s64 %rd7590, %rd12085, 8; setp.eq.s64 %p1195, %rd12082, %rd12088; selp.b64 %rd7591, %rd7590, %rd12082, %p1195; add.s64 %rd7592, %rd12086, 8; selp.b64 %rd7593, %rd7592, %rd12083, %p1195; add.s64 %rd7594, %rd12087, 8; selp.b64 %rd7595, %rd7594, %rd12084, %p1195; selp.b64 %rd12085, %rd7590, %rd12085, %p1195; selp.b64 %rd12086, %rd7592, %rd12086, %p1195; selp.b64 %rd12087, %rd7594, %rd12087, %p1195; add.s64 %rd7596, %rd12082, 8; selp.b64 %rd12088, %rd7596, %rd12088, %p1195; add.s64 %rd7597, %rd7591, 4; add.s64 %rd7598, %rd7593, 4; add.s64 %rd7599, 
%rd7595, 4; selp.b64 %rd12082, %rd7591, %rd7597, %p1194; selp.b64 %rd12083, %rd7593, %rd7598, %p1194; selp.b64 %rd12084, %rd7595, %rd7599, %p1194; ld.local.f32 %f3057, [%rd7593]; ld.local.f32 %f3058, [%rd7583]; setp.eq.f32 %p1196, %f3058, %f3057; mov.u64 %rd12079, %rd2132; @%p1196 bra $L__BB1_837; $L__BB1_839: mov.u64 %rd11115, 0; or.b64 %rd7601, %rd11115, %rd1903; mov.b64 {%r2865, %r2866}, %rd7601; mov.b64 {%r2867, %r2868}, %rd12090; cvt.u32.u64 %r2870, %rd11115; or.b32 %r5016, %r2870, %r2818; mov.u32 %r5017, 0; mov.b32 %f5384, %r2866; mov.b32 {%rs1026, %rs503}, %r2867; mov.u32 %r5018, %r5017; bra.uni $L__BB1_840; $L__BB1_799: sub.f32 %f577, %f560, %f565; sub.f32 %f578, %f561, %f5380; mul.f32 %f2992, %f568, %f578; fma.rn.f32 %f579, %f566, %f577, %f2992; mul.f32 %f2993, %f572, %f578; fma.rn.f32 %f580, %f570, %f577, %f2993; setp.ge.f32 %p1140, %f579, 0f00000000; setp.le.f32 %p1141, %f580, %f579; and.pred %p1142, %p1141, %p1140; @%p1142 bra $L__BB1_832; bra.uni $L__BB1_800; $L__BB1_832: add.u64 %rd12062, %SP, 552; cvta.to.local.u64 %rd12060, %rd12062; add.u64 %rd12068, %SP, 0; cvta.to.local.u64 %rd12066, %rd12068; st.local.u64 [%rd12066], %rd1905; mov.u64 %rd12073, 2; mov.u64 %rd12059, %rd1724; mov.u64 %rd12061, %rd12060; mov.u64 %rd12063, %rd12060; mov.u64 %rd12064, %rd12060; mov.u64 %rd12065, %rd12062; mov.u64 %rd12067, %rd12066; mov.u64 %rd12069, %rd12066; mov.u64 %rd12070, %rd12066; mov.u64 %rd12071, %rd12068; mov.u64 %rd12072, %rd1719; $L__BB1_833: setp.eq.s64 %p1187, %rd12073, 0; mov.u64 %rd12074, 1; @%p1187 bra $L__BB1_835; add.s64 %rd12073, %rd12073, -1; add.s64 %rd7553, %rd12060, 8; setp.eq.s64 %p1188, %rd12063, %rd12059; selp.b64 %rd7554, %rd7553, %rd12063, %p1188; add.s64 %rd7555, %rd12061, 8; selp.b64 %rd7556, %rd7555, %rd12064, %p1188; add.s64 %rd7557, %rd12062, 8; selp.b64 %rd7558, %rd7557, %rd12065, %p1188; mov.u64 %rd12074, 0; setp.eq.s64 %p1189, %rd12073, 0; add.s64 %rd7559, %rd7554, 4; add.s64 %rd7560, %rd7556, 4; add.s64 %rd7561, %rd7558, 4; 
selp.b64 %rd2094, %rd7554, %rd7559, %p1189; selp.b64 %rd12064, %rd7556, %rd7560, %p1189; selp.b64 %rd12065, %rd7558, %rd7561, %p1189; selp.b64 %rd12060, %rd7553, %rd12060, %p1188; selp.b64 %rd12061, %rd7555, %rd12061, %p1188; selp.b64 %rd12062, %rd7557, %rd12062, %p1188; add.s64 %rd7562, %rd12063, 8; selp.b64 %rd12059, %rd7562, %rd12059, %p1188; add.s64 %rd7563, %rd12069, 8; setp.eq.s64 %p1190, %rd12066, %rd12072; selp.b64 %rd7564, %rd7563, %rd12066, %p1190; add.s64 %rd7565, %rd12070, 8; selp.b64 %rd7566, %rd7565, %rd12067, %p1190; add.s64 %rd7567, %rd12071, 8; selp.b64 %rd7568, %rd7567, %rd12068, %p1190; selp.b64 %rd12069, %rd7563, %rd12069, %p1190; selp.b64 %rd12070, %rd7565, %rd12070, %p1190; selp.b64 %rd12071, %rd7567, %rd12071, %p1190; add.s64 %rd7569, %rd12066, 8; selp.b64 %rd12072, %rd7569, %rd12072, %p1190; add.s64 %rd7570, %rd7564, 4; add.s64 %rd7571, %rd7566, 4; add.s64 %rd7572, %rd7568, 4; selp.b64 %rd12066, %rd7564, %rd7570, %p1189; selp.b64 %rd12067, %rd7566, %rd7571, %p1189; selp.b64 %rd12068, %rd7568, %rd7572, %p1189; ld.local.f32 %f3055, [%rd7566]; ld.local.f32 %f3056, [%rd7556]; setp.eq.f32 %p1191, %f3056, %f3055; mov.u64 %rd12063, %rd2094; @%p1191 bra $L__BB1_833; $L__BB1_835: mov.u64 %rd11114, 0; or.b64 %rd7574, %rd11114, %rd1905; mov.b64 {%r2857, %r2858}, %rd7574; mov.b64 {%r2859, %r2860}, %rd12074; cvt.u32.u64 %r2862, %rd11114; or.b32 %r5016, %r2862, %r2820; mov.u32 %r5017, 0; mov.b32 %f5384, %r2858; mov.u32 %r5018, 1; mov.b32 {%rs1026, %rs499}, %r2859; bra.uni $L__BB1_840; $L__BB1_800: sub.f32 %f581, %f560, %f569; sub.f32 %f582, %f561, %f571; mul.f32 %f2994, %f568, %f582; fma.rn.f32 %f583, %f566, %f581, %f2994; mul.f32 %f2995, %f572, %f582; fma.rn.f32 %f584, %f570, %f581, %f2995; setp.ge.f32 %p1143, %f584, 0f00000000; setp.le.f32 %p1144, %f583, %f584; and.pred %p1145, %p1144, %p1143; @%p1145 bra $L__BB1_828; bra.uni $L__BB1_801; $L__BB1_828: add.u64 %rd12046, %SP, 552; cvta.to.local.u64 %rd12044, %rd12046; add.u64 %rd12052, %SP, 0; 
cvta.to.local.u64 %rd12050, %rd12052; st.local.u64 [%rd12050], %rd1906; mov.u64 %rd12057, 2; mov.u64 %rd12043, %rd1724; mov.u64 %rd12045, %rd12044; mov.u64 %rd12047, %rd12044; mov.u64 %rd12048, %rd12044; mov.u64 %rd12049, %rd12046; mov.u64 %rd12051, %rd12050; mov.u64 %rd12053, %rd12050; mov.u64 %rd12054, %rd12050; mov.u64 %rd12055, %rd12052; mov.u64 %rd12056, %rd1720; $L__BB1_829: setp.eq.s64 %p1182, %rd12057, 0; mov.u64 %rd12058, 1; @%p1182 bra $L__BB1_831; add.s64 %rd12057, %rd12057, -1; add.s64 %rd7526, %rd12044, 8; setp.eq.s64 %p1183, %rd12047, %rd12043; selp.b64 %rd7527, %rd7526, %rd12047, %p1183; add.s64 %rd7528, %rd12045, 8; selp.b64 %rd7529, %rd7528, %rd12048, %p1183; add.s64 %rd7530, %rd12046, 8; selp.b64 %rd7531, %rd7530, %rd12049, %p1183; mov.u64 %rd12058, 0; setp.eq.s64 %p1184, %rd12057, 0; add.s64 %rd7532, %rd7527, 4; add.s64 %rd7533, %rd7529, 4; add.s64 %rd7534, %rd7531, 4; selp.b64 %rd2056, %rd7527, %rd7532, %p1184; selp.b64 %rd12048, %rd7529, %rd7533, %p1184; selp.b64 %rd12049, %rd7531, %rd7534, %p1184; selp.b64 %rd12044, %rd7526, %rd12044, %p1183; selp.b64 %rd12045, %rd7528, %rd12045, %p1183; selp.b64 %rd12046, %rd7530, %rd12046, %p1183; add.s64 %rd7535, %rd12047, 8; selp.b64 %rd12043, %rd7535, %rd12043, %p1183; add.s64 %rd7536, %rd12053, 8; setp.eq.s64 %p1185, %rd12050, %rd12056; selp.b64 %rd7537, %rd7536, %rd12050, %p1185; add.s64 %rd7538, %rd12054, 8; selp.b64 %rd7539, %rd7538, %rd12051, %p1185; add.s64 %rd7540, %rd12055, 8; selp.b64 %rd7541, %rd7540, %rd12052, %p1185; selp.b64 %rd12053, %rd7536, %rd12053, %p1185; selp.b64 %rd12054, %rd7538, %rd12054, %p1185; selp.b64 %rd12055, %rd7540, %rd12055, %p1185; add.s64 %rd7542, %rd12050, 8; selp.b64 %rd12056, %rd7542, %rd12056, %p1185; add.s64 %rd7543, %rd7537, 4; add.s64 %rd7544, %rd7539, 4; add.s64 %rd7545, %rd7541, 4; selp.b64 %rd12050, %rd7537, %rd7543, %p1184; selp.b64 %rd12051, %rd7539, %rd7544, %p1184; selp.b64 %rd12052, %rd7541, %rd7545, %p1184; ld.local.f32 %f3053, [%rd7539]; ld.local.f32 
%f3054, [%rd7529]; setp.eq.f32 %p1186, %f3054, %f3053; mov.u64 %rd12047, %rd2056; @%p1186 bra $L__BB1_829; $L__BB1_831: mov.u64 %rd11113, 0; or.b64 %rd7547, %rd11113, %rd1906; mov.b64 {%r2849, %r2850}, %rd7547; mov.b64 {%r2851, %r2852}, %rd12058; cvt.u32.u64 %r2854, %rd11113; or.b32 %r5016, %r2854, %r2822; mov.u32 %r5017, 0; mov.b32 %f5384, %r2850; mov.b32 {%rs1026, %rs495}, %r2851; mov.u32 %r5018, 2; bra.uni $L__BB1_840; $L__BB1_801: sub.f32 %f585, %f569, %f565; sub.f32 %f586, %f571, %f5380; mul.f32 %f2996, %f568, %f570; mul.f32 %f2997, %f566, %f572; sub.f32 %f587, %f2997, %f2996; mul.f32 %f2998, %f568, %f573; mul.f32 %f2999, %f566, %f574; sub.f32 %f3000, %f2999, %f2998; mul.f32 %f3001, %f587, %f3000; setp.lt.f32 %p1146, %f3001, 0f00000000; setp.ge.f32 %p1147, %f575, 0f00000000; and.pred %p1148, %p1147, %p1146; setp.le.f32 %p1149, %f579, 0f00000000; and.pred %p1150, %p1149, %p1148; mov.u16 %rs1025, 0; @%p1150 bra $L__BB1_804; mul.f32 %f3002, %f570, %f582; mul.f32 %f3003, %f581, %f572; sub.f32 %f3004, %f3002, %f3003; mul.f32 %f3005, %f587, %f3004; setp.gt.f32 %p1151, %f3005, 0f80000000; setp.ge.f32 %p1152, %f576, 0f00000000; and.pred %p1153, %p1152, %p1151; setp.le.f32 %p1154, %f584, 0f00000000; and.pred %p1155, %p1154, %p1153; mov.u16 %rs1025, 1; @%p1155 bra $L__BB1_804; mul.f32 %f3006, %f585, %f578; mul.f32 %f3007, %f577, %f586; sub.f32 %f3008, %f3006, %f3007; mul.f32 %f3009, %f587, %f3008; setp.lt.f32 %p1156, %f3009, 0f00000000; sub.f32 %f3010, %f580, %f579; setp.ge.f32 %p1157, %f3010, 0f00000000; and.pred %p1158, %p1157, %p1156; sub.f32 %f3011, %f583, %f584; setp.ge.f32 %p1159, %f3011, 0f00000000; and.pred %p1160, %p1159, %p1158; selp.b16 %rs1025, 2, 3, %p1160; $L__BB1_804: mul.f32 %f3012, %f568, %f568; fma.rn.f32 %f3013, %f566, %f566, %f3012; add.f32 %f588, %f3013, 0f00000000; mul.f32 %f3014, %f572, %f572; fma.rn.f32 %f3015, %f570, %f570, %f3014; add.f32 %f589, %f3015, 0f00000000; mul.f32 %f3016, %f586, %f586; fma.rn.f32 %f3017, %f585, %f585, %f3016; add.f32 
%f590, %f3017, 0f00000000; setp.eq.s16 %p1161, %rs1025, 1; @%p1161 bra $L__BB1_819; setp.eq.s16 %p1162, %rs1025, 2; @%p1162 bra $L__BB1_815; setp.ne.s16 %p1163, %rs1025, 3; @%p1163 bra $L__BB1_823; sub.f32 %f3018, %f575, %f579; div.rn.f32 %f591, %f575, %f3018; sub.f32 %f3019, %f576, %f584; div.rn.f32 %f592, %f576, %f3019; sub.f32 %f3020, %f580, %f579; add.f32 %f3021, %f583, %f3020; sub.f32 %f3022, %f3021, %f584; div.rn.f32 %f5382, %f3020, %f3022; mul.f32 %f3023, %f574, %f574; fma.rn.f32 %f3024, %f573, %f573, %f3023; add.f32 %f3025, %f3024, 0f00000000; mul.f32 %f3026, %f588, %f591; mul.f32 %f3027, %f591, %f3026; sub.f32 %f594, %f3025, %f3027; mul.f32 %f3028, %f589, %f5382; mul.f32 %f3029, %f5382, %f3028; sub.f32 %f595, %f3025, %f3029; mul.f32 %f3030, %f578, %f578; fma.rn.f32 %f3031, %f577, %f577, %f3030; add.f32 %f3032, %f3031, 0f00000000; mul.f32 %f3033, %f590, %f592; mul.f32 %f3034, %f592, %f3033; sub.f32 %f596, %f3032, %f3034; setp.lt.f32 %p1164, %f594, %f595; @%p1164 bra $L__BB1_811; bra.uni $L__BB1_808; $L__BB1_811: setp.lt.f32 %p1166, %f594, %f596; @%p1166 bra $L__BB1_813; bra.uni $L__BB1_812; $L__BB1_813: mul.f32 %f5381, %f568, %f591; fma.rn.f32 %f5379, %f566, %f591, %f563; mov.u32 %r5018, 0; mov.f32 %f5380, %f564; mov.f32 %f5382, %f591; bra.uni $L__BB1_814; $L__BB1_815: add.u64 %rd11996, %SP, 552; cvta.to.local.u64 %rd11994, %rd11996; add.u64 %rd12002, %SP, 0; cvta.to.local.u64 %rd12000, %rd12002; mul.f32 %f3037, %f586, %f578; fma.rn.f32 %f3038, %f585, %f577, %f3037; div.rn.f32 %f5383, %f3038, %f590; fma.rn.f32 %f3039, %f585, %f5383, %f565; mov.b32 %r2831, %f3039; fma.rn.f32 %f3040, %f586, %f5383, %f5380; mov.b32 %r2832, %f3040; cvt.u64.u32 %rd7439, %r2832; cvt.u64.u32 %rd7440, %r2831; bfi.b64 %rd1914, %rd7439, %rd7440, 32, 32; st.local.u64 [%rd12000], %rd1914; mov.u64 %rd12007, 2; mov.u64 %rd11993, %rd1724; mov.u64 %rd11995, %rd11994; mov.u64 %rd11997, %rd11994; mov.u64 %rd11998, %rd11994; mov.u64 %rd11999, %rd11996; mov.u64 %rd12001, %rd12000; mov.u64 
%rd12003, %rd12000; mov.u64 %rd12004, %rd12000; mov.u64 %rd12005, %rd12002; mov.u64 %rd12006, %rd1723; $L__BB1_816: setp.eq.s64 %p1167, %rd12007, 0; mov.u64 %rd12042, 1; @%p1167 bra $L__BB1_818; add.s64 %rd12007, %rd12007, -1; add.s64 %rd7445, %rd11994, 8; setp.eq.s64 %p1168, %rd11997, %rd11993; selp.b64 %rd7446, %rd7445, %rd11997, %p1168; add.s64 %rd7447, %rd11995, 8; selp.b64 %rd7448, %rd7447, %rd11998, %p1168; add.s64 %rd7449, %rd11996, 8; selp.b64 %rd7450, %rd7449, %rd11999, %p1168; mov.u64 %rd12042, 0; setp.eq.s64 %p1169, %rd12007, 0; add.s64 %rd7451, %rd7446, 4; add.s64 %rd7452, %rd7448, 4; add.s64 %rd7453, %rd7450, 4; selp.b64 %rd1931, %rd7446, %rd7451, %p1169; selp.b64 %rd11998, %rd7448, %rd7452, %p1169; selp.b64 %rd11999, %rd7450, %rd7453, %p1169; selp.b64 %rd11994, %rd7445, %rd11994, %p1168; selp.b64 %rd11995, %rd7447, %rd11995, %p1168; selp.b64 %rd11996, %rd7449, %rd11996, %p1168; add.s64 %rd7454, %rd11997, 8; selp.b64 %rd11993, %rd7454, %rd11993, %p1168; add.s64 %rd7455, %rd12003, 8; setp.eq.s64 %p1170, %rd12000, %rd12006; selp.b64 %rd7456, %rd7455, %rd12000, %p1170; add.s64 %rd7457, %rd12004, 8; selp.b64 %rd7458, %rd7457, %rd12001, %p1170; add.s64 %rd7459, %rd12005, 8; selp.b64 %rd7460, %rd7459, %rd12002, %p1170; selp.b64 %rd12003, %rd7455, %rd12003, %p1170; selp.b64 %rd12004, %rd7457, %rd12004, %p1170; selp.b64 %rd12005, %rd7459, %rd12005, %p1170; add.s64 %rd7461, %rd12000, 8; selp.b64 %rd12006, %rd7461, %rd12006, %p1170; add.s64 %rd7462, %rd7456, 4; add.s64 %rd7463, %rd7458, 4; add.s64 %rd7464, %rd7460, 4; selp.b64 %rd12000, %rd7456, %rd7462, %p1169; selp.b64 %rd12001, %rd7458, %rd7463, %p1169; selp.b64 %rd12002, %rd7460, %rd7464, %p1169; ld.local.f32 %f3041, [%rd7458]; ld.local.f32 %f3042, [%rd7448]; setp.eq.f32 %p1171, %f3042, %f3041; mov.u64 %rd11997, %rd1931; @%p1171 bra $L__BB1_816; $L__BB1_818: mov.u64 %rd11110, 0; or.b64 %rd12041, %rd11110, %rd1914; mov.u32 %r5018, 1; bra.uni $L__BB1_827; $L__BB1_819: add.u64 %rd12012, %SP, 552; 
cvta.to.local.u64 %rd12010, %rd12012; add.u64 %rd12018, %SP, 0; cvta.to.local.u64 %rd12016, %rd12018; div.rn.f32 %f5383, %f576, %f589; fma.rn.f32 %f3043, %f570, %f5383, %f563; mov.b32 %r2834, %f3043; fma.rn.f32 %f3044, %f572, %f5383, %f564; mov.b32 %r2835, %f3044; cvt.u64.u32 %rd7466, %r2835; cvt.u64.u32 %rd7467, %r2834; bfi.b64 %rd1955, %rd7466, %rd7467, 32, 32; st.local.u64 [%rd12016], %rd1955; mov.u64 %rd12023, 2; mov.u64 %rd12009, %rd1724; mov.u64 %rd12011, %rd12010; mov.u64 %rd12013, %rd12010; mov.u64 %rd12014, %rd12010; mov.u64 %rd12015, %rd12012; mov.u64 %rd12017, %rd12016; mov.u64 %rd12019, %rd12016; mov.u64 %rd12020, %rd12016; mov.u64 %rd12021, %rd12018; mov.u64 %rd12022, %rd1722; $L__BB1_820: setp.eq.s64 %p1172, %rd12023, 0; mov.u64 %rd12042, 1; @%p1172 bra $L__BB1_822; add.s64 %rd12023, %rd12023, -1; add.s64 %rd7472, %rd12010, 8; setp.eq.s64 %p1173, %rd12013, %rd12009; selp.b64 %rd7473, %rd7472, %rd12013, %p1173; add.s64 %rd7474, %rd12011, 8; selp.b64 %rd7475, %rd7474, %rd12014, %p1173; add.s64 %rd7476, %rd12012, 8; selp.b64 %rd7477, %rd7476, %rd12015, %p1173; mov.u64 %rd12042, 0; setp.eq.s64 %p1174, %rd12023, 0; add.s64 %rd7478, %rd7473, 4; add.s64 %rd7479, %rd7475, 4; add.s64 %rd7480, %rd7477, 4; selp.b64 %rd1972, %rd7473, %rd7478, %p1174; selp.b64 %rd12014, %rd7475, %rd7479, %p1174; selp.b64 %rd12015, %rd7477, %rd7480, %p1174; selp.b64 %rd12010, %rd7472, %rd12010, %p1173; selp.b64 %rd12011, %rd7474, %rd12011, %p1173; selp.b64 %rd12012, %rd7476, %rd12012, %p1173; add.s64 %rd7481, %rd12013, 8; selp.b64 %rd12009, %rd7481, %rd12009, %p1173; add.s64 %rd7482, %rd12019, 8; setp.eq.s64 %p1175, %rd12016, %rd12022; selp.b64 %rd7483, %rd7482, %rd12016, %p1175; add.s64 %rd7484, %rd12020, 8; selp.b64 %rd7485, %rd7484, %rd12017, %p1175; add.s64 %rd7486, %rd12021, 8; selp.b64 %rd7487, %rd7486, %rd12018, %p1175; selp.b64 %rd12019, %rd7482, %rd12019, %p1175; selp.b64 %rd12020, %rd7484, %rd12020, %p1175; selp.b64 %rd12021, %rd7486, %rd12021, %p1175; add.s64 %rd7488, 
%rd12016, 8; selp.b64 %rd12022, %rd7488, %rd12022, %p1175; add.s64 %rd7489, %rd7483, 4; add.s64 %rd7490, %rd7485, 4; add.s64 %rd7491, %rd7487, 4; selp.b64 %rd12016, %rd7483, %rd7489, %p1174; selp.b64 %rd12017, %rd7485, %rd7490, %p1174; selp.b64 %rd12018, %rd7487, %rd7491, %p1174; ld.local.f32 %f3045, [%rd7485]; ld.local.f32 %f3046, [%rd7475]; setp.eq.f32 %p1176, %f3046, %f3045; mov.u64 %rd12013, %rd1972; @%p1176 bra $L__BB1_820; $L__BB1_822: mov.u64 %rd11111, 0; or.b64 %rd12041, %rd11111, %rd1955; mov.u32 %r5018, 2; bra.uni $L__BB1_827; $L__BB1_823: div.rn.f32 %f5383, %f575, %f588; fma.rn.f32 %f3047, %f566, %f5383, %f563; mov.b32 %r2837, %f3047; fma.rn.f32 %f3048, %f568, %f5383, %f564; mov.b32 %r2838, %f3048; cvt.u64.u32 %rd7493, %r2838; cvt.u64.u32 %rd7494, %r2837; bfi.b64 %rd1996, %rd7493, %rd7494, 32, 32; st.local.u64 [%rd7188], %rd1996; mov.u64 %rd12039, 2; mov.u64 %rd12025, %rd1724; mov.u64 %rd12026, %rd7200; mov.u64 %rd12027, %rd7200; mov.u64 %rd12028, %rd7199; mov.u64 %rd12029, %rd7200; mov.u64 %rd12030, %rd7200; mov.u64 %rd12031, %rd7199; mov.u64 %rd12032, %rd7188; mov.u64 %rd12033, %rd7188; mov.u64 %rd12034, %rd7187; mov.u64 %rd12035, %rd7188; mov.u64 %rd12036, %rd7188; mov.u64 %rd12037, %rd7187; mov.u64 %rd12038, %rd1721; $L__BB1_824: setp.eq.s64 %p1177, %rd12039, 0; mov.u64 %rd12042, 1; @%p1177 bra $L__BB1_826; add.s64 %rd12039, %rd12039, -1; add.s64 %rd7499, %rd12026, 8; setp.eq.s64 %p1178, %rd12029, %rd12025; selp.b64 %rd7500, %rd7499, %rd12029, %p1178; add.s64 %rd7501, %rd12027, 8; selp.b64 %rd7502, %rd7501, %rd12030, %p1178; add.s64 %rd7503, %rd12028, 8; selp.b64 %rd7504, %rd7503, %rd12031, %p1178; mov.u64 %rd12042, 0; setp.eq.s64 %p1179, %rd12039, 0; add.s64 %rd7505, %rd7500, 4; add.s64 %rd7506, %rd7502, 4; add.s64 %rd7507, %rd7504, 4; selp.b64 %rd2013, %rd7500, %rd7505, %p1179; selp.b64 %rd12030, %rd7502, %rd7506, %p1179; selp.b64 %rd12031, %rd7504, %rd7507, %p1179; selp.b64 %rd12026, %rd7499, %rd12026, %p1178; selp.b64 %rd12027, %rd7501, %rd12027, 
%p1178; selp.b64 %rd12028, %rd7503, %rd12028, %p1178; add.s64 %rd7508, %rd12029, 8; selp.b64 %rd12025, %rd7508, %rd12025, %p1178; add.s64 %rd7509, %rd12035, 8; setp.eq.s64 %p1180, %rd12032, %rd12038; selp.b64 %rd7510, %rd7509, %rd12032, %p1180; add.s64 %rd7511, %rd12036, 8; selp.b64 %rd7512, %rd7511, %rd12033, %p1180; add.s64 %rd7513, %rd12037, 8; selp.b64 %rd7514, %rd7513, %rd12034, %p1180; selp.b64 %rd12035, %rd7509, %rd12035, %p1180; selp.b64 %rd12036, %rd7511, %rd12036, %p1180; selp.b64 %rd12037, %rd7513, %rd12037, %p1180; add.s64 %rd7515, %rd12032, 8; selp.b64 %rd12038, %rd7515, %rd12038, %p1180; add.s64 %rd7516, %rd7510, 4; add.s64 %rd7517, %rd7512, 4; add.s64 %rd7518, %rd7514, 4; selp.b64 %rd12032, %rd7510, %rd7516, %p1179; selp.b64 %rd12033, %rd7512, %rd7517, %p1179; selp.b64 %rd12034, %rd7514, %rd7518, %p1179; ld.local.f32 %f3049, [%rd7512]; ld.local.f32 %f3050, [%rd7502]; setp.eq.f32 %p1181, %f3050, %f3049; mov.u64 %rd12029, %rd2013; @%p1181 bra $L__BB1_824; $L__BB1_826: mov.u64 %rd11112, 0; or.b64 %rd12041, %rd11112, %rd1996; mov.u32 %r5018, 0; $L__BB1_827: mov.f32 %f3051, 0f3F800000; sub.f32 %f3052, %f3051, %f5383; mov.b32 %r2841, %f3052; mov.b32 %r2842, %f5383; cvt.u64.u32 %rd7519, %r2842; cvt.u64.u32 %rd7520, %r2841; bfi.b64 %rd12091, %rd7519, %rd7520, 32, 32; mov.b64 {%r2843, %r2844}, %rd12042; mov.b64 {%r2845, %r2846}, %rd12041; cvt.u32.u64 %r5016, %rd12041; mov.b32 %f5384, %r2846; mov.u32 %r5017, 1; mov.b32 {%rs1026, %rs491}, %r2843; bra.uni $L__BB1_840; $L__BB1_808: setp.lt.f32 %p1165, %f595, %f596; @%p1165 bra $L__BB1_810; bra.uni $L__BB1_809; $L__BB1_810: mul.f32 %f5381, %f572, %f592; fma.rn.f32 %f5379, %f570, %f592, %f563; mov.u32 %r5018, 2; mov.f32 %f5380, %f564; mov.f32 %f5382, %f592; bra.uni $L__BB1_814; $L__BB1_812: mul.f32 %f5381, %f586, %f5382; fma.rn.f32 %f5379, %f585, %f5382, %f565; mov.u32 %r5018, 1; bra.uni $L__BB1_814; $L__BB1_809: mul.f32 %f5381, %f586, %f5382; fma.rn.f32 %f5379, %f585, %f5382, %f565; mov.u32 %r5018, 1; $L__BB1_814: 
add.f32 %f5384, %f5380, %f5381; mov.f32 %f3035, 0f3F800000; sub.f32 %f3036, %f3035, %f5382; mov.b32 %r2829, %f3036; mov.b32 %r2830, %f5382; cvt.u64.u32 %rd7436, %r2830; cvt.u64.u32 %rd7437, %r2829; bfi.b64 %rd12091, %rd7436, %rd7437, 32, 32; mov.b32 %r5016, %f5379; mov.u32 %r5017, 1; mov.u16 %rs1026, 1; $L__BB1_840: mov.b32 %f3059, %r5016; sub.f32 %f3060, %f3059, %f560; sub.f32 %f3061, %f5384, %f561; mul.f32 %f3062, %f3061, %f3061; fma.rn.f32 %f3063, %f3060, %f3060, %f3062; add.f32 %f3064, %f3063, 0f00000000; sqrt.rn.f32 %f3065, %f3064; shl.b64 %rd7604, %rd1895, 2; add.s64 %rd7605, %rd24, %rd7604; st.local.f32 [%rd7605+-4], %f3065; mul.lo.s64 %rd7606, %rd1895, 36; add.s64 %rd7607, %rd7, %rd7606; st.local.u32 [%rd7607+-36], %r5016; st.local.f32 [%rd7607+-32], %f5384; mov.u16 %rs504, 0; st.local.v4.u8 [%rd7607+-28], {%rs1026, %rs504, %rs504, %rs504}; st.local.u32 [%rd7607+-24], %r581; st.local.u32 [%rd7607+-20], %r5017; st.local.u32 [%rd7607+-16], %r5018; shr.u64 %rd7608, %rd12091, 32; st.local.u32 [%rd7607+-8], %rd7608; st.local.u32 [%rd7607+-12], %rd12091; $L__BB1_841: setp.lt.u64 %p1197, %rd1895, 4; add.s64 %rd1895, %rd1895, 1; @%p1197 bra $L__BB1_788; ld.local.v2.u64 {%rd12092, %rd12093}, [%rd24]; ld.local.v4.u32 {%r5028, %r5029, %r5030, %r2874}, [%rd7]; ld.local.u32 %r5031, [%rd7+16]; ld.local.u32 %rd7611, [%rd1735+4]; ld.local.u32 %rd7612, [%rd1735+8]; bfi.b64 %rd7613, %rd7612, %rd7611, 32, 32; mov.b64 {%r5025, %r5026}, %rd7613; ld.local.u32 %r5027, [%rd1735+12]; ld.local.u32 %r5032, [%rd1736+4]; ld.local.u32 %r5024, [%rd1737+16]; ld.local.u64 %rd7614, [%rd1737+8]; mov.b64 {%r5022, %r5023}, %rd7614; ld.local.u32 %r5033, [%rd1738+8]; ld.local.u32 %rd7615, [%rd1739+12]; ld.local.u32 %rd7616, [%rd1739+16]; bfi.b64 %rd7617, %rd7616, %rd7615, 32, 32; mov.b64 {%r5019, %r5020}, %rd7617; ld.local.u32 %r5021, [%rd1739+20]; ld.local.u32 %r5034, [%rd1740+12]; bra.uni $L__BB1_843; $L__BB1_786: mov.u32 %r5031, 4; mov.u32 %r5032, %r5031; mov.u32 %r5033, %r5031; mov.u32 
%r5034, %r5031; $L__BB1_843: and.b64 %rd7618, %rd1891, 1; setp.eq.b64 %p1198, %rd7618, 1; mov.pred %p1199, 0; xor.pred %p1200, %p1198, %p1199; not.pred %p1201, %p1200; mov.b64 {%r624, %r625}, %rd12092; mov.b32 %f619, %r624; mov.b32 %f620, %r625; mov.b64 {%r626, %r627}, %rd12093; mov.b32 %f621, %r626; mov.b32 %f622, %r627; @%p1201 bra $L__BB1_852; bra.uni $L__BB1_844; $L__BB1_852: and.b64 %rd7636, %rd1891, 2; setp.eq.s64 %p1215, %rd7636, 0; @%p1215 bra $L__BB1_861; bra.uni $L__BB1_853; $L__BB1_861: and.b64 %rd7654, %rd1891, 4; setp.eq.s64 %p1229, %rd7654, 0; @%p1229 bra $L__BB1_870; bra.uni $L__BB1_862; $L__BB1_870: and.b64 %rd7672, %rd1891, 8; setp.eq.s64 %p1243, %rd7672, 0; @%p1243 bra $L__BB1_778; ld.u8 %rs511, [%rd1880+88]; and.b16 %rs512, %rs511, 1; setp.eq.b16 %p1244, %rs512, 1; mov.pred %p1245, 0; xor.pred %p1246, %p1244, %p1245; not.pred %p1247, %p1246; @%p1247 bra $L__BB1_874; bra.uni $L__BB1_872; $L__BB1_874: ld.u32 %r675, [%rd1880+76]; cvt.u64.u32 %rd7676, %r675; setp.le.u64 %p1254, %rd1868, %rd7676; @%p1254 bra $L__BB1_778; neg.f32 %f626, %f622; setp.lt.u32 %p1255, %r580, 64; @%p1255 bra $L__BB1_877; bra.uni $L__BB1_876; $L__BB1_877: mul.wide.u32 %rd7688, %r580, 8; add.s64 %rd7689, %rd1869, %rd7688; mov.u64 %rd12100, 0; st.local.u32 [%rd7689], %r675; st.local.f32 [%rd7689+4], %f626; add.s32 %r580, %r580, 1; st.local.u32 [%rd1869+512], %r580; mov.u64 %rd12101, %rd12100; bra.uni $L__BB1_878; $L__BB1_844: ld.u8 %rs505, [%rd1880+88]; and.b16 %rs506, %rs505, 1; setp.eq.b16 %p1202, %rs506, 1; xor.pred %p1204, %p1202, %p1199; not.pred %p1205, %p1204; @%p1205 bra $L__BB1_847; bra.uni $L__BB1_845; $L__BB1_847: ld.u32 %r633, [%rd1880+64]; cvt.u64.u32 %rd7622, %r633; setp.le.u64 %p1212, %rd1868, %rd7622; @%p1212 bra $L__BB1_852; neg.f32 %f623, %f619; setp.lt.u32 %p1213, %r580, 64; @%p1213 bra $L__BB1_850; bra.uni $L__BB1_849; $L__BB1_850: add.s32 %r2877, %r579, -1; mul.wide.u32 %rd7634, %r2877, 8; add.s64 %rd7635, %rd1869, %rd7634; mov.u64 %rd12094, 0; st.local.u32 
[%rd7635], %r633; st.local.f32 [%rd7635+4], %f623; add.s32 %r580, %r580, 1; st.local.u32 [%rd1869+512], %r580; mov.u64 %rd12095, %rd12094; bra.uni $L__BB1_851; $L__BB1_853: ld.u8 %rs507, [%rd1880+88]; and.b16 %rs508, %rs507, 1; setp.eq.b16 %p1216, %rs508, 1; mov.pred %p1217, 0; xor.pred %p1218, %p1216, %p1217; not.pred %p1219, %p1218; @%p1219 bra $L__BB1_856; bra.uni $L__BB1_854; $L__BB1_856: ld.u32 %r647, [%rd1880+68]; cvt.u64.u32 %rd7640, %r647; setp.le.u64 %p1226, %rd1868, %rd7640; @%p1226 bra $L__BB1_861; neg.f32 %f624, %f620; setp.lt.u32 %p1227, %r580, 64; @%p1227 bra $L__BB1_859; bra.uni $L__BB1_858; $L__BB1_859: mul.wide.u32 %rd7652, %r580, 8; add.s64 %rd7653, %rd1869, %rd7652; mov.u64 %rd12096, 0; st.local.u32 [%rd7653], %r647; st.local.f32 [%rd7653+4], %f624; add.s32 %r580, %r580, 1; st.local.u32 [%rd1869+512], %r580; mov.u64 %rd12097, %rd12096; bra.uni $L__BB1_860; $L__BB1_862: ld.u8 %rs509, [%rd1880+88]; and.b16 %rs510, %rs509, 1; setp.eq.b16 %p1230, %rs510, 1; mov.pred %p1231, 0; xor.pred %p1232, %p1230, %p1231; not.pred %p1233, %p1232; @%p1233 bra $L__BB1_865; bra.uni $L__BB1_863; $L__BB1_865: ld.u32 %r661, [%rd1880+72]; cvt.u64.u32 %rd7658, %r661; setp.le.u64 %p1240, %rd1868, %rd7658; @%p1240 bra $L__BB1_870; neg.f32 %f625, %f621; setp.lt.u32 %p1241, %r580, 64; @%p1241 bra $L__BB1_868; bra.uni $L__BB1_867; $L__BB1_868: mul.wide.u32 %rd7670, %r580, 8; add.s64 %rd7671, %rd1869, %rd7670; mov.u64 %rd12098, 0; st.local.u32 [%rd7671], %r661; st.local.f32 [%rd7671+4], %f625; add.s32 %r580, %r580, 1; st.local.u32 [%rd1869+512], %r580; mov.u64 %rd12099, %rd12098; bra.uni $L__BB1_869; $L__BB1_845: setp.leu.f32 %p1206, %f562, %f619; setp.eq.s32 %p1207, %r5031, 4; or.pred %p1208, %p1207, %p1206; @%p1208 bra $L__BB1_852; ld.u32 %r2875, [%rd1880+64]; cvt.u64.u32 %rd7619, %r2875; setp.le.u64 %p1209, %rd1871, %rd7619; mul.wide.u32 %rd7620, %r2875, 12; add.s64 %rd7621, %rd1872, %rd7620; setp.eq.s64 %p1210, %rd7621, 0; or.pred %p1211, %p1209, %p1210; selp.b32 %r575, 
%r575, %r5030, %p1211; selp.b32 %r574, %r574, %r5029, %p1211; selp.b32 %r573, %r573, %r5028, %p1211; selp.b32 %r577, %r577, %r5031, %p1211; selp.b32 %r578, %r578, %r624, %p1211; bra.uni $L__BB1_852; $L__BB1_872: mov.b32 %f3068, %r578; setp.leu.f32 %p1248, %f3068, %f622; setp.eq.s32 %p1249, %r5034, 4; or.pred %p1250, %p1249, %p1248; @%p1250 bra $L__BB1_778; bra.uni $L__BB1_873; $L__BB1_854: mov.b32 %f3066, %r578; setp.leu.f32 %p1220, %f3066, %f620; setp.eq.s32 %p1221, %r5032, 4; or.pred %p1222, %p1221, %p1220; @%p1222 bra $L__BB1_861; ld.u32 %r2883, [%rd1880+68]; cvt.u64.u32 %rd7637, %r2883; setp.le.u64 %p1223, %rd1871, %rd7637; mul.wide.u32 %rd7638, %r2883, 12; add.s64 %rd7639, %rd1872, %rd7638; setp.eq.s64 %p1224, %rd7639, 0; or.pred %p1225, %p1223, %p1224; selp.b32 %r575, %r575, %r5027, %p1225; selp.b32 %r574, %r574, %r5026, %p1225; selp.b32 %r573, %r573, %r5025, %p1225; selp.b32 %r577, %r577, %r5032, %p1225; selp.b32 %r578, %r578, %r625, %p1225; bra.uni $L__BB1_861; $L__BB1_863: mov.b32 %f3067, %r578; setp.leu.f32 %p1234, %f3067, %f621; setp.eq.s32 %p1235, %r5033, 4; or.pred %p1236, %p1235, %p1234; @%p1236 bra $L__BB1_870; ld.u32 %r2890, [%rd1880+72]; cvt.u64.u32 %rd7655, %r2890; setp.le.u64 %p1237, %rd1871, %rd7655; mul.wide.u32 %rd7656, %r2890, 12; add.s64 %rd7657, %rd1872, %rd7656; setp.eq.s64 %p1238, %rd7657, 0; or.pred %p1239, %p1237, %p1238; selp.b32 %r575, %r575, %r5024, %p1239; selp.b32 %r574, %r574, %r5023, %p1239; selp.b32 %r573, %r573, %r5022, %p1239; selp.b32 %r577, %r577, %r5033, %p1239; selp.b32 %r578, %r578, %r626, %p1239; bra.uni $L__BB1_870; $L__BB1_849: mov.u64 %rd12095, 1; shl.b64 %rd12094, %rd7622, 32; $L__BB1_851: mov.u64 %rd11116, 0; cvt.u32.u64 %r2878, %rd11116; cvt.u32.u64 %r2879, %rd12094; or.b32 %r2880, %r2879, %r2878; cvt.u32.u64 %r2881, %rd12095; or.b32 %r2882, %r2880, %r2881; setp.ne.s32 %p1214, %r2882, 0; @%p1214 bra $L__BB1_879; bra.uni $L__BB1_852; $L__BB1_876: mov.u64 %rd12101, 1; shl.b64 %rd12100, %rd7676, 32; $L__BB1_878: 
mov.u64 %rd11125, 0; cvt.u32.u64 %r2899, %rd11125; cvt.u32.u64 %r2900, %rd12100; or.b32 %r2901, %r2900, %r2899; cvt.u32.u64 %r2902, %rd12101; or.b32 %r2903, %r2901, %r2902; setp.eq.s32 %p1256, %r2903, 0; @%p1256 bra $L__BB1_778; bra.uni $L__BB1_879; $L__BB1_858: mov.u64 %rd12097, 1; shl.b64 %rd12096, %rd7640, 32; $L__BB1_860: mov.u64 %rd11119, 0; cvt.u32.u64 %r2885, %rd11119; cvt.u32.u64 %r2886, %rd12096; or.b32 %r2887, %r2886, %r2885; cvt.u32.u64 %r2888, %rd12097; or.b32 %r2889, %r2887, %r2888; setp.ne.s32 %p1228, %r2889, 0; @%p1228 bra $L__BB1_879; bra.uni $L__BB1_861; $L__BB1_867: mov.u64 %rd12099, 1; shl.b64 %rd12098, %rd7658, 32; $L__BB1_869: mov.u64 %rd11122, 0; cvt.u32.u64 %r2892, %rd11122; cvt.u32.u64 %r2893, %rd12098; or.b32 %r2894, %r2893, %r2892; cvt.u32.u64 %r2895, %rd12099; or.b32 %r2896, %r2894, %r2895; setp.ne.s32 %p1242, %r2896, 0; @%p1242 bra $L__BB1_879; bra.uni $L__BB1_870; $L__BB1_880: setp.eq.s32 %p1257, %r577, 4; mov.u64 %rd12102, %rd7344; mov.u64 %rd12103, %rd7342; mov.u64 %rd12104, %rd7344; @%p1257 bra $L__BB1_882; mov.b64 %rd12104, {%r573, %r574}; mov.b32 {%rs513, %rs514}, %r575; mov.b64 %rd7696, {%r575, %r2904}; and.b64 %rd12102, %rd7696, 4294967040; cvt.u64.u16 %rd7697, %rs513; and.b64 %rd12103, %rd7697, 255; $L__BB1_882: or.b64 %rd7704, %rd12103, %rd12102; or.b64 %rd7705, %rd7704, %rd7344; mov.b64 {%r2905, %r2906}, %rd7705; mov.b32 {%rs86, %rs515}, %r2905; and.b16 %rs516, %rs86, 255; setp.eq.s16 %p1258, %rs516, 2; @%p1258 bra $L__BB1_884; cvt.u32.u64 %r2907, %rd12104; mov.b32 %f3069, %r2907; shr.u64 %rd7706, %rd12104, 32; cvt.u32.u64 %r2908, %rd7706; mov.b32 %f3070, %r2908; ld.global.f32 %f3071, [%rd1746+-24]; mul.f32 %f3072, %f3071, %f3069; ld.global.f32 %f3073, [%rd1746+-20]; mul.f32 %f3074, %f3073, %f3070; sub.f32 %f3075, %f3072, %f3074; mul.f32 %f3076, %f3073, %f3069; fma.rn.f32 %f3077, %f3071, %f3070, %f3076; ld.global.f32 %f3078, [%rd1746+-16]; add.f32 %f3079, %f3078, %f3075; mov.b32 %r2909, %f3079; ld.global.f32 %f3080, 
[%rd1746+-12]; add.f32 %f3081, %f3080, %f3077; mov.b32 %r2910, %f3081; cvt.u64.u32 %rd7707, %r2910; cvt.u64.u32 %rd7708, %r2909; cvt.u64.u16 %rd7709, %rs86; bfi.b64 %rd7344, %rd7707, %rd7708, 32, 32; and.b64 %rd7710, %rd7709, 255; mov.b64 {%r2911, %r2912}, %rd7710; mov.b32 {%rs517, %rs518}, %r2911; cvt.u64.u16 %rd7342, %rs517; $L__BB1_884: mov.u64 %rd11134, 0; or.b64 %rd7717, %rd11134, %rd7342; or.b64 %rd2219, %rd7717, %rd11134; mov.b64 {%r2913, %r2914}, %rd2219; mov.b32 {%rs87, %rs519}, %r2913; and.b16 %rs520, %rs87, 255; setp.eq.s16 %p1259, %rs520, 2; mov.u64 %rd12107, 2; mov.u64 %rd12108, %rd11134; mov.u64 %rd12109, %rd11134; @%p1259 bra $L__BB1_886; and.b64 %rd7719, %rd2219, 4294967040; cvt.u64.u16 %rd7720, %rs87; and.b64 %rd7721, %rd7720, 255; or.b64 %rd7722, %rd7721, %rd11134; or.b64 %rd7723, %rd7722, %rd7719; mov.b64 {%r2915, %r2916}, %rd7723; mov.b32 {%rs521, %rs522}, %r2915; not.b16 %rs523, %rs521; ld.global.u8 %rs524, [%rd1746+-32]; setp.eq.s16 %p1260, %rs524, 0; and.b16 %rs525, %rs523, 1; selp.b16 %rs526, %rs521, %rs525, %p1260; and.b64 %rd7724, %rd7723, 4294967040; cvt.u64.u16 %rd7725, %rs526; and.b64 %rd7726, %rd7725, 255; or.b64 %rd7727, %rd7724, %rd11134; or.b64 %rd7728, %rd7727, %rd7726; mov.b64 {%r2917, %r2918}, %rd7728; mov.b32 {%rs527, %rs528}, %r2917; and.b64 %rd12109, %rd7728, 4294967040; cvt.u64.u16 %rd7729, %rs527; and.b64 %rd12107, %rd7729, 255; mov.u64 %rd12108, %rd7344; $L__BB1_886: or.b64 %rd7730, %rd12108, %rd11134; or.b64 %rd7731, %rd11134, %rd12107; or.b64 %rd7732, %rd7731, %rd12109; or.b64 %rd7733, %rd7730, %rd11134; mov.b64 {%r5065, %r5066}, %rd7733; mov.b64 {%r5067, %r2919}, %rd7732; bra.uni $L__BB1_943; $L__BB1_746: cvt.u32.u64 %r2712, %rd1748; cvt.u32.u64 %r2713, %rd1767; rem.u32 %r2714, %r2713, %r2712; cvt.u64.u32 %rd11949, %r2714; $L__BB1_747: shl.b64 %rd7247, %rd11949, 3; add.s64 %rd1771, %rd1749, %rd7247; ld.u32 %rd7248, [%rd1771]; ld.u32 %rd7249, [%rd1771+4]; bfi.b64 %rd1772, %rd7249, %rd7248, 32, 32; add.s64 %rd1773, 
%rd11949, 1; or.b64 %rd7250, %rd1773, %rd1748; and.b64 %rd7251, %rd7250, -4294967296; setp.eq.s64 %p1077, %rd7251, 0; @%p1077 bra $L__BB1_749; rem.u64 %rd11950, %rd1773, %rd1748; bra.uni $L__BB1_750; $L__BB1_749: cvt.u32.u64 %r2715, %rd1748; cvt.u32.u64 %r2716, %rd1773; rem.u32 %r2717, %r2716, %r2715; cvt.u64.u32 %rd11950, %r2717; $L__BB1_750: add.u64 %rd11960, %SP, 560; cvta.to.local.u64 %rd11958, %rd11960; shl.b64 %rd7253, %rd11950, 3; add.s64 %rd1783, %rd1749, %rd7253; ld.u32 %rd7254, [%rd1783]; ld.u32 %rd7255, [%rd1783+4]; bfi.b64 %rd7256, %rd7255, %rd7254, 32, 32; st.local.v2.u64 [%rd11958], {%rd1772, %rd7256}; mov.u64 %rd11965, 2; mov.u64 %rd11951, %rd1730; mov.u64 %rd11952, %rd1728; mov.u64 %rd11953, %rd1728; mov.u64 %rd11954, %rd1729; mov.u64 %rd11955, %rd1728; mov.u64 %rd11956, %rd1728; mov.u64 %rd11957, %rd1729; mov.u64 %rd11959, %rd11958; mov.u64 %rd11961, %rd11958; mov.u64 %rd11962, %rd11958; mov.u64 %rd11963, %rd11960; mov.u64 %rd11964, %rd1731; $L__BB1_751: setp.eq.s64 %p1078, %rd11965, 0; @%p1078 bra $L__BB1_754; add.s64 %rd11965, %rd11965, -1; add.s64 %rd7257, %rd11952, 8; setp.eq.s64 %p1079, %rd11955, %rd11951; selp.b64 %rd7258, %rd7257, %rd11955, %p1079; add.s64 %rd7259, %rd11953, 8; selp.b64 %rd7260, %rd7259, %rd11956, %p1079; add.s64 %rd7261, %rd11954, 8; selp.b64 %rd7262, %rd7261, %rd11957, %p1079; setp.eq.s64 %p1080, %rd11965, 0; add.s64 %rd7263, %rd7258, 4; add.s64 %rd7264, %rd7260, 4; add.s64 %rd7265, %rd7262, 4; selp.b64 %rd1800, %rd7258, %rd7263, %p1080; selp.b64 %rd11956, %rd7260, %rd7264, %p1080; selp.b64 %rd11957, %rd7262, %rd7265, %p1080; selp.b64 %rd11952, %rd7257, %rd11952, %p1079; selp.b64 %rd11953, %rd7259, %rd11953, %p1079; selp.b64 %rd11954, %rd7261, %rd11954, %p1079; add.s64 %rd7266, %rd11955, 8; selp.b64 %rd11951, %rd7266, %rd11951, %p1079; add.s64 %rd7267, %rd11961, 8; setp.eq.s64 %p1081, %rd11958, %rd11964; selp.b64 %rd7268, %rd7267, %rd11958, %p1081; add.s64 %rd7269, %rd11962, 8; selp.b64 %rd7270, %rd7269, %rd11959, %p1081; 
add.s64 %rd7271, %rd11963, 8; selp.b64 %rd7272, %rd7271, %rd11960, %p1081; selp.b64 %rd11961, %rd7267, %rd11961, %p1081; selp.b64 %rd11962, %rd7269, %rd11962, %p1081; selp.b64 %rd11963, %rd7271, %rd11963, %p1081; add.s64 %rd7273, %rd11958, 8; selp.b64 %rd11964, %rd7273, %rd11964, %p1081; add.s64 %rd7274, %rd7268, 4; add.s64 %rd7275, %rd7270, 4; add.s64 %rd7276, %rd7272, 4; selp.b64 %rd11958, %rd7268, %rd7274, %p1080; selp.b64 %rd11959, %rd7270, %rd7275, %p1080; selp.b64 %rd11960, %rd7272, %rd7276, %p1080; ld.local.f32 %f2835, [%rd7270]; ld.local.f32 %f2836, [%rd7260]; setp.eq.f32 %p1082, %f2836, %f2835; mov.u64 %rd11955, %rd1800; @%p1082 bra $L__BB1_751; bra.uni $L__BB1_753; $L__BB1_754: ld.u32 %rd7277, [%rd1771]; ld.u32 %rd7278, [%rd1771+4]; bfi.b64 %rd7279, %rd7278, %rd7277, 32, 32; cvt.u32.u64 %r2718, %rd7279; mov.b32 %f2837, %r2718; shr.u64 %rd7280, %rd7279, 32; cvt.u32.u64 %r2719, %rd7280; mov.b32 %f2838, %r2719; ld.u32 %rd7281, [%rd1783]; ld.u32 %rd7282, [%rd1783+4]; bfi.b64 %rd7283, %rd7282, %rd7281, 32, 32; cvt.u32.u64 %r2720, %rd7283; shr.u64 %rd7284, %rd7283, 32; cvt.u32.u64 %r2721, %rd7284; mov.b32 %f2839, %r2720; sub.f32 %f5377, %f2839, %f2837; mov.b32 %f2840, %r2721; sub.f32 %f5378, %f2840, %f2838; bra.uni $L__BB1_765; $L__BB1_759: cvt.u32.u64 %r2722, %rd1748; cvt.u32.u64 %r2723, %rd1814; rem.u32 %r2724, %r2723, %r2722; cvt.u64.u32 %rd11966, %r2724; $L__BB1_760: shl.b64 %rd7293, %rd11966, 3; add.s64 %rd7294, %rd1749, %rd7293; ld.u32 %rd7295, [%rd7294]; ld.u32 %rd7296, [%rd7294+4]; bfi.b64 %rd1825, %rd7296, %rd7295, 32, 32; add.u64 %rd7298, %SPL, 560; st.local.v2.u64 [%rd7298], {%rd1815, %rd1825}; mov.u64 %rd11981, 2; mov.u64 %rd11967, %rd1728; mov.u64 %rd11968, %rd1725; mov.u64 %rd11969, %rd1725; mov.u64 %rd11970, %rd1727; mov.u64 %rd11971, %rd1725; mov.u64 %rd11972, %rd1725; mov.u64 %rd11973, %rd1727; mov.u64 %rd11974, %rd1732; mov.u64 %rd11975, %rd1732; mov.u64 %rd11976, %rd1733; mov.u64 %rd11977, %rd1732; mov.u64 %rd11978, %rd1732; mov.u64 %rd11979, 
%rd1733; mov.u64 %rd11980, %rd1734; $L__BB1_761: setp.eq.s64 %p1086, %rd11981, 0; @%p1086 bra $L__BB1_764; add.s64 %rd11981, %rd11981, -1; add.s64 %rd7299, %rd11968, 8; setp.eq.s64 %p1087, %rd11971, %rd11967; selp.b64 %rd7300, %rd7299, %rd11971, %p1087; add.s64 %rd7301, %rd11969, 8; selp.b64 %rd7302, %rd7301, %rd11972, %p1087; add.s64 %rd7303, %rd11970, 8; selp.b64 %rd7304, %rd7303, %rd11973, %p1087; setp.eq.s64 %p1088, %rd11981, 0; add.s64 %rd7305, %rd7300, 4; add.s64 %rd7306, %rd7302, 4; add.s64 %rd7307, %rd7304, 4; selp.b64 %rd1842, %rd7300, %rd7305, %p1088; selp.b64 %rd11972, %rd7302, %rd7306, %p1088; selp.b64 %rd11973, %rd7304, %rd7307, %p1088; selp.b64 %rd11968, %rd7299, %rd11968, %p1087; selp.b64 %rd11969, %rd7301, %rd11969, %p1087; selp.b64 %rd11970, %rd7303, %rd11970, %p1087; add.s64 %rd7308, %rd11971, 8; selp.b64 %rd11967, %rd7308, %rd11967, %p1087; add.s64 %rd7309, %rd11977, 8; setp.eq.s64 %p1089, %rd11974, %rd11980; selp.b64 %rd7310, %rd7309, %rd11974, %p1089; add.s64 %rd7311, %rd11978, 8; selp.b64 %rd7312, %rd7311, %rd11975, %p1089; add.s64 %rd7313, %rd11979, 8; selp.b64 %rd7314, %rd7313, %rd11976, %p1089; selp.b64 %rd11977, %rd7309, %rd11977, %p1089; selp.b64 %rd11978, %rd7311, %rd11978, %p1089; selp.b64 %rd11979, %rd7313, %rd11979, %p1089; add.s64 %rd7315, %rd11974, 8; selp.b64 %rd11980, %rd7315, %rd11980, %p1089; add.s64 %rd7316, %rd7310, 4; add.s64 %rd7317, %rd7312, 4; add.s64 %rd7318, %rd7314, 4; selp.b64 %rd11974, %rd7310, %rd7316, %p1088; selp.b64 %rd11975, %rd7312, %rd7317, %p1088; selp.b64 %rd11976, %rd7314, %rd7318, %p1088; ld.local.f32 %f2841, [%rd7312]; ld.local.f32 %f2842, [%rd7302]; setp.eq.f32 %p1090, %f2842, %f2841; mov.u64 %rd11971, %rd1842; @%p1090 bra $L__BB1_761; bra.uni $L__BB1_763; $L__BB1_764: cvt.u32.u64 %r2725, %rd1815; mov.b32 %f2843, %r2725; shr.u64 %rd7319, %rd1815, 32; cvt.u32.u64 %r2726, %rd7319; mov.b32 %f2844, %r2726; shr.u64 %rd7320, %rd1825, 32; cvt.u32.u64 %r2727, %rd7320; cvt.u32.u64 %r2728, %rd1825; mov.b32 %f2845, 
%r2728; sub.f32 %f2846, %f2845, %f2843; mov.b32 %f2847, %r2727; sub.f32 %f2848, %f2847, %f2844; neg.f32 %f5377, %f2846; neg.f32 %f5378, %f2848; $L__BB1_765: mul.f32 %f2849, %f552, %f5378; fma.rn.f32 %f559, %f551, %f5377, %f2849; mul.f32 %f2850, %f5378, %f5378; fma.rn.f32 %f2851, %f5377, %f5377, %f2850; add.f32 %f2852, %f2851, 0f00000000; sqrt.rn.f32 %f2853, %f2852; mul.f32 %f2854, %f2853, 0f3A83126F; abs.f32 %f2855, %f559; setp.gt.f32 %p1091, %f2855, %f2854; @%p1091 bra $L__BB1_767; bra.uni $L__BB1_766; $L__BB1_767: setp.ge.f32 %p2926, %f559, 0f00000000; bra.uni $L__BB1_770; $L__BB1_766: ld.local.u64 %rd7321, [%rd1747+8]; cvt.u32.u64 %r2729, %rd7321; mov.b32 %f2856, %r2729; shr.u64 %rd7322, %rd7321, 32; cvt.u32.u64 %r2730, %rd7322; mov.b32 %f2857, %r2730; sub.f32 %f2858, %f518, %f2856; sub.f32 %f2859, %f519, %f2857; mul.f32 %f2860, %f552, %f2859; fma.rn.f32 %f2861, %f551, %f2858, %f2860; setp.le.f32 %p2926, %f2861, 0f00000000; $L__BB1_770: selp.u16 %rs464, 1, 0, %p2926; st.local.u8 [%rd1747+16], %rs464; $L__BB1_771: ld.local.v2.u32 {%r5003, %r5004}, [%rd1747+8]; ld.local.u32 %r5005, [%rd1747+16]; $L__BB1_773: setp.eq.s32 %p1092, %r558, 2; mov.u64 %rd7330, 0; mov.u64 %rd11982, 2; mov.u64 %rd11983, %rd7330; @%p1092 bra $L__BB1_775; setp.ne.s16 %p1093, %rs71, 0; cvt.u16.u32 %rs466, %r5005; selp.u16 %rs467, 1, 0, %p1093; xor.b16 %rs468, %rs466, %rs467; mov.b32 %f2868, %r5003; mov.b32 %f2869, %r5004; mul.f32 %f2870, %f522, %f2868; ld.global.f32 %f2871, [%rd1746+-20]; mul.f32 %f2872, %f2871, %f2869; sub.f32 %f2873, %f2870, %f2872; mul.f32 %f2874, %f2871, %f2868; fma.rn.f32 %f2875, %f522, %f2869, %f2874; add.f32 %f2876, %f520, %f2873; mov.b32 %r2735, %f2876; add.f32 %f2877, %f521, %f2875; mov.b32 %r2736, %f2877; cvt.u64.u32 %rd7331, %r2736; cvt.u64.u32 %rd7332, %r2735; cvt.u64.u16 %rd7333, %rs468; bfi.b64 %rd11983, %rd7331, %rd7332, 32, 32; and.b64 %rd7334, %rd7333, 255; mov.b64 {%r2737, %r2738}, %rd7334; mov.b32 {%rs469, %rs470}, %r2737; cvt.u64.u16 %rd11982, %rs469; 
$L__BB1_775: or.b64 %rd7335, %rd7330, %rd7330; or.b64 %rd7336, %rd11982, %rd7330; or.b64 %rd7337, %rd7336, %rd7330; or.b64 %rd7338, %rd7335, %rd11983; mov.b64 {%r5065, %r5066}, %rd7338; mov.b64 {%r5067, %r2739}, %rd7337; $L__BB1_943: mov.b32 {%rs92, %rs538}, %r5067; and.b16 %rs539, %rs92, 255; setp.eq.s16 %p1338, %rs539, 2; @%p1338 bra $L__BB1_945; mov.b64 %rd7815, {%r5067, %r2990}; shr.u64 %rd7816, %rd7815, 8; and.b64 %rd7817, %rd7816, 16777215; cvt.u64.u16 %rd7818, %rs92; and.b64 %rd7819, %rd7818, 255; mov.b64 %rd7207, {%r5065, %r5066}; bfi.b64 %rd2275, %rd7817, %rd7819, 8, 56; mov.b64 {%r2660, %r2991}, %rd2275; $L__BB1_945: mov.b32 {%rs540, %rs541}, %r2660; and.b16 %rs542, %rs540, 255; setp.eq.s16 %p1339, %rs542, 2; cvt.u64.u16 %rd7820, %rs540; and.b64 %rd7821, %rd7820, 255; selp.b64 %rd7822, 2, %rd7821, %p1339; mov.b64 %rd7823, {%r2660, %r2992}; and.b64 %rd7824, %rd7823, 4294967040; or.b64 %rd2281, %rd7824, %rd7822; mov.b64 {%r2993, %r2994}, %rd2281; mov.b32 {%rs93, %rs543}, %r2993; and.b16 %rs544, %rs93, 255; setp.eq.s16 %p1340, %rs544, 2; @%p1340 bra $L__BB1_947; bra.uni $L__BB1_946; $L__BB1_947: setp.ne.s64 %p1341, %rd1745, 0; add.s64 %rd11941, %rd1743, 280; add.s64 %rd11942, %rd1744, 280; @%p1341 bra $L__BB1_714; $L__BB1_948: add.s64 %rd2320, %rd1743, 280; add.s64 %rd2321, %rd1744, 280; mov.u64 %rd7207, %rd7182; bra.uni $L__BB1_949; $L__BB1_946: add.s64 %rd2320, %rd1743, 280; add.s64 %rd2321, %rd1744, 280; shl.b64 %rd7825, %rd2281, 16; shr.u64 %rd7826, %rd7825, 24; cvt.u64.u16 %rd7827, %rs93; and.b64 %rd7828, %rd7827, 255; bfi.b64 %rd7829, %rd7826, %rd7828, 8, 56; mov.b64 {%r2657, %r2995}, %rd7829; $L__BB1_949: mov.b32 {%rs545, %rs546}, %r2657; and.b16 %rs547, %rs545, 255; setp.eq.s16 %p1342, %rs547, 2; cvt.u64.u16 %rd7832, %rs545; and.b64 %rd7833, %rd7832, 255; selp.b64 %rd7834, 2, %rd7833, %p1342; mov.b64 %rd7835, {%r2657, %r2999}; and.b64 %rd7836, %rd7835, 4294967040; or.b64 %rd7837, %rd7836, %rd7182; or.b64 %rd2294, %rd7837, %rd7834; mov.b64 {%r3000, 
%r3001}, %rd2294; mov.b32 {%rs94, %rs548}, %r3000; and.b16 %rs549, %rs94, 255; setp.eq.s16 %p1343, %rs549, 2; mov.f32 %f858, 0f00000000; @%p1343 bra $L__BB1_1185; and.b64 %rd7838, %rd2294, 4294967040; cvt.u64.u16 %rd7839, %rs94; and.b64 %rd7840, %rd7839, 255; or.b64 %rd7841, %rd7840, %rd7182; or.b64 %rd7842, %rd7841, %rd7838; mov.b64 {%r3002, %r3003}, %rd7842; mov.b32 {%rs550, %rs551}, %r3002; shr.u64 %rd7843, %rd7207, 32; cvt.u32.u64 %r3004, %rd7843; cvt.u32.u64 %r3005, %rd7207; mov.b32 %f3183, %r3005; sub.f32 %f3184, %f3183, %f518; mov.b32 %f3185, %r3004; sub.f32 %f3186, %f3185, %f519; mul.f32 %f3187, %f3186, %f3186; fma.rn.f32 %f3188, %f3184, %f3184, %f3187; add.f32 %f3189, %f3188, 0f00000000; sqrt.rn.f32 %f3190, %f3189; and.b16 %rs552, %rs550, 1; setp.eq.b16 %p1344, %rs552, 1; selp.f32 %f3191, 0fBF800000, 0f3F800000, %p1344; mul.f32 %f687, %f3191, %f3190; setp.eq.s64 %p1345, %rd2321, 0; setp.eq.s64 %p1346, %rd1745, 0; or.pred %p1347, %p1345, %p1346; @%p1347 bra $L__BB1_1183; add.u64 %rd7844, %SP, 560; add.u64 %rd7845, %SPL, 560; add.s64 %rd2295, %rd7845, 8; add.u64 %rd7848, %SP, 0; add.u64 %rd7849, %SPL, 0; add.s64 %rd2296, %rd7849, 8; add.s64 %rd2297, %rd7849, 8; add.s64 %rd2298, %rd7849, 8; add.s64 %rd2299, %rd7849, 8; add.s64 %rd2300, %rd7849, 8; add.s64 %rd2301, %rd7849, 8; add.u64 %rd7860, %SP, 552; add.u64 %rd7861, %SPL, 552; add.s64 %rd2302, %rd7861, 8; add.u64 %rd7862, %SP, 32; add.u64 %rd7863, %SPL, 32; add.s64 %rd2303, %rd7863, 36; add.s64 %rd2304, %rd7863, 4; add.s64 %rd2305, %rd7862, 36; add.s64 %rd2306, %rd7863, 44; add.s64 %rd2307, %rd7862, 44; add.s64 %rd2308, %rd7863, 52; add.s64 %rd2309, %rd7845, 8; add.s64 %rd2310, %rd7845, 8; or.b64 %rd2311, %rd7844, 8; add.s64 %rd2312, %rd7845, 16; add.s64 %rd2313, %rd7, 32; add.s64 %rd2314, %rd7, 48; add.s64 %rd2315, %rd7, 64; add.s64 %rd2316, %rd7, 80; add.s64 %rd2317, %rd7, 96; add.s64 %rd2318, %rd7, 112; $L__BB1_952: add.s64 %rd1745, %rd1745, -1; ld.global.u32 %r3006, [%rd2320+272]; setp.eq.s32 %p1348, 
%r3006, 3; @%p1348 bra $L__BB1_1182; ld.global.u16 %rs553, [%rd2320]; setp.eq.s16 %p1349, %rs553, 1; @%p1349 bra $L__BB1_1124; setp.eq.s16 %p1350, %rs553, 2; @%p1350 bra $L__BB1_1013; setp.ne.s16 %p1351, %rs553, 3; @%p1351 bra $L__BB1_1162; ld.global.u8 %rs95, [%rd2320+24]; ld.global.f32 %f688, [%rd2320+256]; sub.f32 %f3192, %f518, %f688; ld.global.f32 %f689, [%rd2320+260]; sub.f32 %f3193, %f519, %f689; ld.global.f32 %f3194, [%rd2320+252]; ld.global.f32 %f690, [%rd2320+248]; mul.f32 %f3195, %f3193, %f3194; fma.rn.f32 %f691, %f3192, %f690, %f3195; mul.f32 %f3196, %f3192, %f3194; mul.f32 %f3197, %f3193, %f690; sub.f32 %f692, %f3197, %f3196; cvta.to.local.u64 %rd2324, %rd7862; mov.u32 %r744, 2; st.local.u32 [%rd2324+20], %r744; ld.global.u64 %rd2325, [%rd2320+16]; setp.eq.s64 %p1352, %rd2325, 0; @%p1352 bra $L__BB1_1010; mov.b32 %r3021, %f692; ld.global.u64 %rd2326, [%rd2320+8]; mov.b32 %r3022, %f691; and.b32 %r3023, %r3022, 2147483647; mov.b32 %f693, %r3023; and.b32 %r3024, %r3021, 2147483647; mov.b32 %f694, %r3024; mov.u64 %rd12132, 1; bra.uni $L__BB1_958; $L__BB1_966: sub.f32 %f3209, %f5394, %f691; abs.f32 %f709, %f3209; setp.le.f32 %p1362, %f709, 0f34000000; @%p1362 bra $L__BB1_968; abs.f32 %f3210, %f5394; abs.f32 %f3211, %f691; setp.gt.f32 %p1364, %f3211, %f3210; selp.f32 %f3212, %f3211, %f3210, %p1364; mul.f32 %f3213, %f3212, 0f34000000; setp.gtu.f32 %p1365, %f709, %f3213; @%p1365 bra $L__BB1_972; bra.uni $L__BB1_968; $L__BB1_958: shl.b64 %rd7871, %rd12132, 3; add.s64 %rd7872, %rd2326, %rd7871; setp.eq.s64 %p1353, %rd12132, %rd2325; selp.b64 %rd7873, 0, %rd12132, %p1353; shl.b64 %rd7874, %rd7873, 3; add.s64 %rd7875, %rd2326, %rd7874; ld.u32 %rd7876, [%rd7872+-8]; ld.u32 %rd7877, [%rd7872+-4]; bfi.b64 %rd2329, %rd7877, %rd7876, 32, 32; ld.u32 %rd7878, [%rd7875]; ld.u32 %rd7879, [%rd7875+4]; bfi.b64 %rd2330, %rd7879, %rd7878, 32, 32; cvt.u32.u64 %r5071, %rd2329; mov.b32 %f5394, %r5071; shr.u64 %rd7880, %rd2329, 32; cvt.u32.u64 %r3027, %rd7880; mov.b32 %f697, 
%r3027; cvt.u32.u64 %r728, %rd2330; shr.u64 %rd7881, %rd2330, 32; cvt.u32.u64 %r3028, %rd7881; mov.b32 %f698, %r728; sub.f32 %f699, %f698, %f5394; mov.b32 %f3199, %r3028; sub.f32 %f700, %f3199, %f697; sub.f32 %f3200, %f691, %f5394; sub.f32 %f3201, %f692, %f697; mul.f32 %f3202, %f700, %f3201; fma.rn.f32 %f701, %f699, %f3200, %f3202; mul.f32 %f3203, %f700, %f700; fma.rn.f32 %f3204, %f699, %f699, %f3203; add.f32 %f702, %f3204, 0f00000000; setp.gtu.f32 %p1354, %f701, 0f00000000; mov.b64 {%r3029, %r5072}, %rd2329; mov.b64 {%r3030, %r730}, %rd2330; @%p1354 bra $L__BB1_960; bra.uni $L__BB1_959; $L__BB1_960: setp.ltu.f32 %p1355, %f701, %f702; @%p1355 bra $L__BB1_962; bra.uni $L__BB1_961; $L__BB1_962: setp.eq.f32 %p1356, %f702, 0f00000000; @%p1356 bra $L__BB1_1009; div.rn.f32 %f3205, %f701, %f702; mov.f32 %f3206, 0f3F800000; sub.f32 %f3207, %f3206, %f3205; mov.b32 %r5074, %f3207; mov.b32 %r5075, %f3205; fma.rn.f32 %f5394, %f699, %f3205, %f5394; mov.b32 %r5071, %f5394; fma.rn.f32 %f5395, %f700, %f3205, %f697; mov.b32 %r5072, %f5395; mov.u32 %r5073, 1; bra.uni $L__BB1_964; $L__BB1_959: mov.b32 %f5395, %r5072; mov.u32 %r5073, 0; mov.u32 %r5074, %r5073; bra.uni $L__BB1_964; $L__BB1_961: mov.b32 %f5395, %r730; mov.u32 %r5074, 1; mov.u32 %r5073, 0; mov.f32 %f5394, %f698; mov.u32 %r5071, %r728; mov.u32 %r5072, %r730; $L__BB1_964: setp.eq.f32 %p1357, %f691, %f5394; @%p1357 bra $L__BB1_968; bra.uni $L__BB1_965; $L__BB1_968: setp.eq.f32 %p1367, %f5395, %f692; mov.pred %p1366, -1; mov.pred %p2931, %p1366; @%p1367 bra $L__BB1_972; setp.eq.f32 %p1369, %f694, 0f7F800000; and.b32 %r3039, %r5072, 2147483647; mov.b32 %f3214, %r3039; setp.eq.f32 %p1370, %f3214, 0f7F800000; or.pred %p1371, %p1369, %p1370; mov.pred %p2931, 0; @%p1371 bra $L__BB1_972; sub.f32 %f3215, %f5395, %f692; abs.f32 %f710, %f3215; setp.le.f32 %p1373, %f710, 0f34000000; mov.pred %p2931, %p1366; @%p1373 bra $L__BB1_972; abs.f32 %f3216, %f5395; abs.f32 %f3217, %f692; setp.gt.f32 %p1374, %f3217, %f3216; selp.f32 %f3218, 
%f3217, %f3216, %p1374; mul.f32 %f3219, %f3218, 0f34000000; setp.le.f32 %p2931, %f710, %f3219; bra.uni $L__BB1_972; $L__BB1_965: setp.eq.f32 %p1359, %f693, 0f7F800000; and.b32 %r3038, %r5071, 2147483647; mov.b32 %f3208, %r3038; setp.eq.f32 %p1360, %f3208, 0f7F800000; or.pred %p1361, %p1359, %p1360; mov.pred %p2931, 0; @%p1361 bra $L__BB1_972; bra.uni $L__BB1_966; $L__BB1_972: cvt.u64.u32 %rd7882, %r5072; cvt.u64.u32 %rd7883, %r5071; bfi.b64 %rd2331, %rd7882, %rd7883, 32, 32; mov.b64 {%r3040, %r3041}, %rd2331; selp.u64 %rd2332, 1, 0, %p2931; mov.b32 %f712, %r3041; mov.b32 %f711, %r3040; sub.f32 %f3220, %f711, %f691; sub.f32 %f3221, %f712, %f692; mul.f32 %f3222, %f3221, %f3221; fma.rn.f32 %f3223, %f3220, %f3220, %f3222; add.f32 %f3224, %f3223, 0f00000000; sqrt.rn.f32 %f714, %f3224; setp.geu.f32 %p1375, %f714, %f5396; setp.ne.s32 %p1376, %r744, 2; and.pred %p1377, %p1376, %p1375; @%p1377 bra $L__BB1_974; add.s64 %rd12133, %rd12132, -1; st.local.u64 [%rd2324], %rd12133; st.local.v2.f32 [%rd2324+8], {%f711, %f712}; mov.b64 {%r3044, %r3045}, %rd2332; st.local.v2.u32 [%rd2324+16], {%r3044, %r5073}; st.local.v2.u32 [%rd2324+24], {%r5074, %r5075}; st.local.f32 [%rd2324+32], %f714; st.local.u32 [%rd2324+36], %rd2329; st.local.u32 [%rd2324+44], %rd2330; st.local.u32 [%rd2324+40], %rd7880; st.local.u32 [%rd2324+48], %rd7881; mov.f32 %f5396, %f714; mov.u32 %r744, %r5073; $L__BB1_974: add.s64 %rd2335, %rd12132, 1; setp.lt.u64 %p1378, %rd12132, %rd2325; mov.u64 %rd12132, %rd2335; @%p1378 bra $L__BB1_958; ld.local.u32 %rd7890, [%rd2324+36]; ld.local.u32 %rd7891, [%rd2324+40]; bfi.b64 %rd7892, %rd7891, %rd7890, 32, 32; mov.u64 %rd7889, 0; cvt.u32.u64 %r3046, %rd7892; mov.b32 %f3225, %r3046; shr.u64 %rd7893, %rd7892, 32; cvt.u32.u64 %r3047, %rd7893; mov.b32 %f3226, %r3047; ld.local.u32 %rd7894, [%rd2324+44]; ld.local.u32 %rd7895, [%rd2324+48]; bfi.b64 %rd7896, %rd7895, %rd7894, 32, 32; cvt.u32.u64 %r3048, %rd7896; shr.u64 %rd7897, %rd7896, 32; cvt.u32.u64 %r3049, %rd7897; mov.b32 
%f3227, %r3048; sub.f32 %f716, %f3227, %f3225; mov.b32 %f3228, %r3049; sub.f32 %f717, %f3228, %f3226; mul.f32 %f3229, %f717, %f717; fma.rn.f32 %f3230, %f716, %f716, %f3229; add.f32 %f718, %f3230, 0f00000000; setp.leu.f32 %p1379, %f718, 0f28800000; mov.u64 %rd12134, %rd7889; mov.u64 %rd12135, %rd7889; mov.u64 %rd12136, %rd7889; @%p1379 bra $L__BB1_977; neg.f32 %f3231, %f716; sqrt.rn.f32 %f3232, %f718; div.rn.f32 %f3233, %f717, %f3232; div.rn.f32 %f3234, %f3231, %f3232; mov.b32 %r3050, %f3234; mov.b32 %r3051, %f3233; mov.u64 %rd12136, 1; mov.b64 %rd7900, {%r3051, %r3050}; shr.u64 %rd12135, %rd7900, 32; shl.b64 %rd12134, %rd7900, 32; $L__BB1_977: or.b64 %rd2342, %rd12136, %rd12134; or.b64 %rd2343, %rd7889, %rd12135; and.b64 %rd7901, %rd7889, 4294967295; xor.b64 %rd7902, %rd12136, 1; or.b64 %rd7903, %rd7902, %rd7901; setp.ne.s64 %p1380, %rd7903, 0; @%p1380 bra $L__BB1_1008; mov.b64 {%r3052, %r3053}, %rd2343; mov.b64 {%r3054, %r3055}, %rd2342; mov.b32 %f719, %r3055; mov.b32 %f720, %r3052; setp.eq.s32 %p1381, %r744, 1; @%p1381 bra $L__BB1_1006; bra.uni $L__BB1_979; $L__BB1_1006: ld.local.u64 %rd7982, [%rd2324+8]; cvt.u32.u64 %r3076, %rd7982; mov.b32 %f3262, %r3076; shr.u64 %rd7983, %rd7982, 32; cvt.u32.u64 %r3077, %rd7983; mov.b32 %f3263, %r3077; sub.f32 %f3264, %f518, %f3262; sub.f32 %f3265, %f519, %f3263; mul.f32 %f3266, %f720, %f3265; fma.rn.f32 %f3267, %f719, %f3264, %f3266; setp.le.f32 %p2932, %f3267, 0f00000000; bra.uni $L__BB1_1007; $L__BB1_1013: ld.global.f32 %f3278, [%rd2320+256]; mov.u64 %rd8003, 0; sub.f32 %f3279, %f518, %f3278; ld.global.f32 %f3280, [%rd2320+260]; sub.f32 %f3281, %f519, %f3280; ld.global.f32 %f3282, [%rd2320+252]; ld.global.f32 %f3283, [%rd2320+248]; mul.f32 %f3284, %f3281, %f3282; fma.rn.f32 %f728, %f3279, %f3283, %f3284; mul.f32 %f3285, %f3279, %f3282; mul.f32 %f3286, %f3281, %f3283; sub.f32 %f729, %f3286, %f3285; mov.b32 %r3085, %f728; mov.b32 %r3086, %f729; cvt.u64.u32 %rd8004, %r3086; cvt.u64.u32 %rd8005, %r3085; bfi.b64 %rd8006, 
%rd8004, %rd8005, 32, 32; st.local.u64 [%rd7861], %rd8006; ld.global.u64 %rd2445, [%rd2320+32]; setp.eq.s64 %p1402, %rd2445, 0; mov.u64 %rd8001, 2; mov.u64 %rd12290, %rd8003; mov.u64 %rd12291, %rd8001; mov.u64 %rd12292, %rd8003; @%p1402 bra $L__BB1_1119; cvta.to.local.u64 %rd2446, %rd7862; mov.u32 %r3093, 0; st.local.u32 [%rd2446], %r3093; mov.u32 %r3094, -16777217; st.local.u32 [%rd2446+4], %r3094; mov.u32 %r766, 1; st.local.u32 [%rd2446+512], %r766; ld.global.u64 %rd2447, [%rd2320+24]; ld.global.u64 %rd2448, [%rd2320+80]; ld.global.u64 %rd2449, [%rd2320+72]; mov.u32 %r764, 2139095039; mov.u32 %r763, 4; bra.uni $L__BB1_1015; $L__BB1_1124: ld.global.f32 %f795, [%rd2320+256]; sub.f32 %f3482, %f518, %f795; ld.global.f32 %f796, [%rd2320+260]; sub.f32 %f3483, %f519, %f796; ld.global.f32 %f3484, [%rd2320+252]; ld.global.f32 %f797, [%rd2320+248]; mul.f32 %f3485, %f3483, %f3484; fma.rn.f32 %f798, %f3482, %f797, %f3485; mul.f32 %f3486, %f3482, %f3484; mul.f32 %f3487, %f3483, %f797; sub.f32 %f799, %f3487, %f3486; mov.b32 %r867, %f798; mov.b32 %r868, %f799; ld.global.u64 %rd2808, [%rd2320+56]; ld.global.u64 %rd2807, [%rd2320+48]; sub.f32 %f3488, %f798, %f6; sub.f32 %f3489, %f799, %f6; mov.b32 %r3265, %f3488; mov.b32 %r3266, %f3489; cvt.u64.u32 %rd8393, %r3266; cvt.u64.u32 %rd8394, %r3265; add.f32 %f3490, %f6, %f798; add.f32 %f3491, %f6, %f799; mov.b32 %r3267, %f3490; mov.b32 %r3268, %f3491; cvt.u64.u32 %rd8395, %r3268; cvt.u64.u32 %rd8396, %r3267; bfi.b64 %rd8397, %rd8393, %rd8394, 32, 32; mov.b64 {%r3269, %r3270}, %rd8397; bfi.b64 %rd8398, %rd8395, %rd8396, 32, 32; mov.b64 {%r3271, %r3272}, %rd8398; cvta.to.local.u64 %rd2809, %rd7862; mov.u16 %rs619, 2; st.local.u8 [%rd2809+8], %rs619; mov.b32 %f803, %r3272; mov.b32 %f801, %r3270; mov.b32 %f802, %r3271; mov.b32 %f800, %r3269; ld.global.v2.f32 {%f3492, %f3493}, [%rd2320+40]; div.rn.f32 %f806, %f800, %f3492; div.rn.f32 %f807, %f802, %f3492; ld.global.u64 %rd2810, [%rd2320+16]; cvt.rn.f32.u64 %f3494, %rd2810; add.f32 %f3495, 
%f3494, 0fBF800000; rcp.rn.f32 %f808, %f3495; setp.lt.f32 %p1569, %f807, 0fBF000000; setp.gt.f32 %p1570, %f806, 0f3F000000; or.pred %p1571, %p1570, %p1569; @%p1571 bra $L__BB1_1156; add.f32 %f3496, %f806, 0f3F000000; div.rn.f32 %f3497, %f3496, %f808; cvt.rmi.f32.f32 %f3498, %f3497; add.s64 %rd8400, %rd2810, -2; cvt.rn.f32.u64 %f3499, %rd8400; setp.gt.f32 %p1572, %f3498, 0f00000000; setp.lt.f32 %p1573, %f3498, %f3499; selp.f32 %f3500, %f3498, %f3499, %p1573; selp.f32 %f3501, %f3500, 0f00000000, %p1572; setp.gt.f32 %p1574, %f3501, 0f5F7FFFFF; max.f32 %f3502, %f3501, 0f00000000; cvt.rzi.u64.f32 %rd8401, %f3502; selp.b64 %rd2816, -1, %rd8401, %p1574; add.f32 %f3503, %f807, 0f3F000000; div.rn.f32 %f3504, %f3503, %f808; cvt.rpi.f32.f32 %f3505, %f3504; add.s64 %rd8402, %rd2810, -1; cvt.rn.f32.u64 %f3506, %rd8402; setp.gt.f32 %p1575, %f3505, 0f00000000; setp.lt.f32 %p1576, %f3505, %f3506; selp.f32 %f3507, %f3505, %f3506, %p1576; selp.f32 %f3508, %f3507, 0f00000000, %p1575; setp.gt.f32 %p1577, %f3508, 0f5F7FFFFF; max.f32 %f3509, %f3508, 0f00000000; cvt.rzi.u64.f32 %rd8403, %f3509; selp.b64 %rd2812, -1, %rd8403, %p1577; setp.ge.u64 %p1578, %rd2816, %rd2812; @%p1578 bra $L__BB1_1156; div.rn.f32 %f809, %f801, %f3493; div.rn.f32 %f810, %f803, %f3493; ld.global.u64 %rd2813, [%rd2320+32]; ld.global.u64 %rd2814, [%rd2320+24]; ld.global.u64 %rd2815, [%rd2320+8]; and.b32 %r3273, %r867, 2147483647; mov.b32 %f811, %r3273; and.b32 %r3274, %r868, 2147483647; mov.b32 %f812, %r3274; ld.local.v4.u32 {%r5136, %r5137, %r5138, %r3278}, [%rd2809]; mov.f32 %f5408, 0f7F7FFFFF; bra.uni $L__BB1_1127; $L__BB1_1162: ld.global.f32 %f837, [%rd2320+256]; sub.f32 %f3551, %f518, %f837; ld.global.f32 %f838, [%rd2320+260]; sub.f32 %f3552, %f519, %f838; ld.global.f32 %f839, [%rd2320+252]; ld.global.f32 %f840, [%rd2320+248]; mul.f32 %f3553, %f3552, %f839; fma.rn.f32 %f841, %f3551, %f840, %f3553; mul.f32 %f3554, %f3551, %f839; mul.f32 %f3555, %f3552, %f840; sub.f32 %f842, %f3555, %f3554; ld.global.u32 
%rd8430, [%rd2320+8]; ld.global.u32 %rd8431, [%rd2320+12]; bfi.b64 %rd8432, %rd8431, %rd8430, 32, 32; cvt.u32.u64 %r3317, %rd8432; mov.b32 %f3556, %r3317; shr.u64 %rd8433, %rd8432, 32; cvt.u32.u64 %r3318, %rd8433; mov.b32 %f3557, %r3318; neg.f32 %f3558, %f3556; neg.f32 %f3559, %f3557; sub.f32 %f843, %f3558, %f841; sub.f32 %f844, %f3559, %f842; sub.f32 %f845, %f841, %f3556; sub.f32 %f846, %f842, %f3557; setp.ge.f32 %p1627, %f843, 0f00000000; selp.f32 %f3560, %f843, 0f00000000, %p1627; setp.ge.f32 %p1628, %f844, 0f00000000; selp.f32 %f3561, %f844, 0f00000000, %p1628; setp.ge.f32 %p1629, %f845, 0f00000000; selp.f32 %f3562, %f845, 0f00000000, %p1629; setp.ge.f32 %p1630, %f846, 0f00000000; selp.f32 %f3563, %f846, 0f00000000, %p1630; sub.f32 %f847, %f3560, %f3562; mov.b32 %r3319, %f847; sub.f32 %f848, %f3561, %f3563; mov.b32 %r3320, %f848; cvt.u64.u32 %rd8434, %r3320; cvt.u64.u32 %rd8435, %r3319; bfi.b64 %rd8436, %rd8434, %rd8435, 32, 32; st.local.u64 [%rd7845], %rd8436; mov.u64 %rd12306, 2; mov.u64 %rd12299, %rd2295; mov.u64 %rd12300, %rd7845; mov.u64 %rd12301, %rd7845; mov.u64 %rd12302, %rd7844; mov.u64 %rd12303, %rd7845; mov.u64 %rd12304, %rd7845; mov.u64 %rd12305, %rd7844; $L__BB1_1163: setp.eq.s64 %p1631, %rd12306, 0; @%p1631 bra $L__BB1_1166; add.s64 %rd12306, %rd12306, -1; add.s64 %rd8437, %rd12303, 8; setp.eq.s64 %p1632, %rd12303, %rd12299; selp.b64 %rd12299, %rd8437, %rd12299, %p1632; add.s64 %rd8438, %rd12300, 8; selp.b64 %rd12300, %rd8438, %rd12300, %p1632; add.s64 %rd8439, %rd12301, 8; selp.b64 %rd12301, %rd8439, %rd12301, %p1632; add.s64 %rd8440, %rd12302, 8; selp.b64 %rd12302, %rd8440, %rd12302, %p1632; selp.b64 %rd8441, %rd8438, %rd12303, %p1632; selp.b64 %rd8442, %rd8439, %rd12304, %p1632; selp.b64 %rd8443, %rd8440, %rd12305, %p1632; setp.eq.s64 %p1633, %rd12306, 0; add.s64 %rd8444, %rd8441, 4; add.s64 %rd8445, %rd8442, 4; add.s64 %rd8446, %rd8443, 4; selp.b64 %rd12303, %rd8441, %rd8444, %p1633; selp.b64 %rd12304, %rd8442, %rd8445, %p1633; selp.b64 
%rd12305, %rd8443, %rd8446, %p1633; ld.local.f32 %f3564, [%rd8442]; setp.eq.f32 %p1634, %f3564, 0f00000000; @%p1634 bra $L__BB1_1163; add.f32 %f3565, %f841, %f847; mov.b32 %r3321, %f3565; add.f32 %f3566, %f842, %f848; mov.b32 %r3322, %f3566; cvt.u64.u32 %rd8449, %r3322; cvt.u64.u32 %rd8450, %r3321; bfi.b64 %rd12309, %rd8449, %rd8450, 32, 32; mov.u64 %rd12310, 0; bra.uni $L__BB1_1179; $L__BB1_1166: setp.lt.f32 %p1635, %f843, %f845; mov.f32 %f5409, 0fFF7FFFFF; @%p1635 bra $L__BB1_1169; bra.uni $L__BB1_1167; $L__BB1_1169: setp.leu.f32 %p1640, %f845, 0fFF7FFFFF; mov.pred %p2936, 0; @%p1640 bra $L__BB1_1171; mov.f32 %f5409, %f845; bra.uni $L__BB1_1171; $L__BB1_1167: setp.leu.f32 %p1637, %f843, 0fFF7FFFFF; mov.pred %p2936, 0; @%p1637 bra $L__BB1_1171; mov.pred %p2936, -1; mov.f32 %f5409, %f843; $L__BB1_1171: setp.lt.f32 %p1642, %f844, %f846; @%p1642 bra $L__BB1_1174; bra.uni $L__BB1_1172; $L__BB1_1174: setp.gt.f32 %p1644, %f846, %f5409; @%p1644 bra $L__BB1_1177; bra.uni $L__BB1_1175; $L__BB1_1177: cvta.to.local.u64 %rd8457, %rd7862; mov.u64 %rd8458, 0; st.local.u64 [%rd8457], %rd8458; neg.f32 %f5411, %f846; mov.u64 %rd12308, %rd2304; bra.uni $L__BB1_1178; $L__BB1_1172: setp.leu.f32 %p1643, %f844, %f5409; @%p1643 bra $L__BB1_1175; mov.u64 %rd8453, 0; st.local.u64 [%rd7863], %rd8453; mov.u64 %rd12308, %rd2304; mov.f32 %f5409, %f844; bra.uni $L__BB1_1176; $L__BB1_1175: mov.u64 %rd8455, 0; st.local.u64 [%rd7863], %rd8455; neg.f32 %f5411, %f5409; not.pred %p1645, %p2936; mov.u64 %rd12308, %rd7863; @%p1645 bra $L__BB1_1178; $L__BB1_1176: mov.f32 %f5411, %f5409; $L__BB1_1178: st.local.f32 [%rd12308], %f5411; ld.local.u64 %rd8463, [%rd7863]; cvt.u32.u64 %r3323, %rd8463; mov.b32 %f3569, %r3323; shr.u64 %rd8464, %rd8463, 32; cvt.u32.u64 %r3324, %rd8464; mov.b32 %f3570, %r3324; add.f32 %f3571, %f841, %f3569; add.f32 %f3572, %f842, %f3570; mov.b32 %r3325, %f3571; mov.b32 %r3326, %f3572; cvt.u64.u32 %rd8465, %r3326; cvt.u64.u32 %rd8466, %r3325; bfi.b64 %rd12309, %rd8465, %rd8466, 32, 
32; mov.u64 %rd12310, 1; $L__BB1_1179: mov.u64 %rd11173, 0; cvt.u32.u64 %r3327, %rd12309; mov.b32 %f3573, %r3327; shr.u64 %rd8467, %rd12309, 32; cvt.u32.u64 %r3328, %rd8467; mov.b32 %f3574, %r3328; mul.f32 %f3575, %f840, %f3573; mul.f32 %f3576, %f839, %f3574; sub.f32 %f3577, %f3575, %f3576; mul.f32 %f3578, %f840, %f3574; fma.rn.f32 %f3579, %f839, %f3573, %f3578; add.f32 %f3580, %f837, %f3577; mov.b32 %r3329, %f3580; add.f32 %f3581, %f838, %f3579; mov.b32 %r3330, %f3581; cvt.u64.u32 %rd8468, %r3330; cvt.u64.u32 %rd8469, %r3329; bfi.b64 %rd8470, %rd8468, %rd8469, 32, 32; or.b64 %rd8471, %rd11173, %rd8470; mov.b64 {%r5139, %r5140}, %rd8471; mov.b64 {%r5141, %r3331}, %rd12310; bra.uni $L__BB1_1180; $L__BB1_1144: sub.f32 %f3522, %f5406, %f798; abs.f32 %f830, %f3522; setp.le.f32 %p1597, %f830, 0f34000000; @%p1597 bra $L__BB1_1146; abs.f32 %f3523, %f5406; abs.f32 %f3524, %f798; setp.gt.f32 %p1599, %f3524, %f3523; selp.f32 %f3525, %f3524, %f3523, %p1599; mul.f32 %f3526, %f3525, 0f34000000; setp.gtu.f32 %p1600, %f830, %f3526; @%p1600 bra $L__BB1_1150; bra.uni $L__BB1_1146; $L__BB1_1127: setp.gt.u64 %p1579, %rd2813, %rd2816; @%p1579 bra $L__BB1_1129; bra.uni $L__BB1_1128; $L__BB1_1129: add.s64 %rd8404, %rd2814, %rd2816; ld.u8 %rs620, [%rd8404]; setp.eq.s16 %p1580, %rs620, 0; @%p1580 bra $L__BB1_1154; cvt.rn.f32.u64 %f3511, %rd2816; fma.rn.f32 %f814, %f808, %f3511, 0fBF000000; setp.gt.u64 %p1581, %rd2810, %rd2816; @%p1581 bra $L__BB1_1132; bra.uni $L__BB1_1131; $L__BB1_1132: shl.b64 %rd8405, %rd2816, 2; add.s64 %rd2817, %rd2815, %rd8405; ld.f32 %f815, [%rd2817]; add.s64 %rd8406, %rd2816, 1; setp.gt.u64 %p1582, %rd2810, %rd8406; @%p1582 bra $L__BB1_1134; bra.uni $L__BB1_1133; $L__BB1_1134: ld.f32 %f816, [%rd2817+4]; setp.gt.f32 %p1583, %f816, %f810; setp.gt.f32 %p1584, %f815, %f810; and.pred %p1585, %p1584, %p1583; @%p1585 bra $L__BB1_1154; setp.lt.f32 %p1586, %f815, %f809; setp.lt.f32 %p1587, %f816, %f809; and.pred %p1588, %p1586, %p1587; @%p1588 bra $L__BB1_1154; mul.f32 
%f3512, %f3492, %f814; mov.b32 %r3279, %f3512; mul.f32 %f819, %f3493, %f815; mov.b32 %r3280, %f819; cvt.u64.u32 %rd8407, %r3280; cvt.u64.u32 %rd8408, %r3279; add.f32 %f3513, %f808, %f814; mul.f32 %f817, %f3492, %f3513; mov.b32 %r875, %f817; mul.f32 %f3514, %f3493, %f816; mov.b32 %r3281, %f3514; cvt.u64.u32 %rd8409, %r3281; cvt.u64.u32 %rd8410, %r875; bfi.b64 %rd8411, %rd8409, %rd8410, 32, 32; bfi.b64 %rd8412, %rd8407, %rd8408, 32, 32; cvt.u32.u64 %r5134, %rd8412; mov.b32 %f5406, %r5134; sub.f32 %f820, %f817, %f5406; sub.f32 %f821, %f3514, %f819; sub.f32 %f3515, %f798, %f5406; sub.f32 %f3516, %f799, %f819; mul.f32 %f3517, %f821, %f3516; fma.rn.f32 %f822, %f820, %f3515, %f3517; mul.f32 %f3518, %f821, %f821; fma.rn.f32 %f3519, %f820, %f820, %f3518; add.f32 %f823, %f3519, 0f00000000; setp.gtu.f32 %p1589, %f822, 0f00000000; mov.b64 {%r3282, %r5135}, %rd8412; mov.b64 {%r3283, %r878}, %rd8411; @%p1589 bra $L__BB1_1138; bra.uni $L__BB1_1137; $L__BB1_1138: setp.ltu.f32 %p1590, %f822, %f823; @%p1590 bra $L__BB1_1140; bra.uni $L__BB1_1139; $L__BB1_1140: setp.eq.f32 %p1591, %f823, 0f00000000; @%p1591 bra $L__BB1_1153; div.rn.f32 %f3520, %f822, %f823; fma.rn.f32 %f5406, %f820, %f3520, %f5406; mov.b32 %r5134, %f5406; fma.rn.f32 %f5407, %f821, %f3520, %f819; mov.b32 %r5135, %f5407; bra.uni $L__BB1_1142; $L__BB1_1137: mov.b32 %f5407, %r5135; bra.uni $L__BB1_1142; $L__BB1_1139: mov.b32 %f5407, %r878; mov.f32 %f5406, %f817; mov.u32 %r5134, %r875; mov.u32 %r5135, %r878; $L__BB1_1142: setp.eq.f32 %p1592, %f798, %f5406; @%p1592 bra $L__BB1_1146; bra.uni $L__BB1_1143; $L__BB1_1146: setp.eq.f32 %p1602, %f5407, %f799; mov.pred %p1601, -1; mov.pred %p2934, %p1601; @%p1602 bra $L__BB1_1150; setp.eq.f32 %p1604, %f812, 0f7F800000; and.b32 %r3285, %r5135, 2147483647; mov.b32 %f3527, %r3285; setp.eq.f32 %p1605, %f3527, 0f7F800000; or.pred %p1606, %p1604, %p1605; mov.pred %p2934, 0; @%p1606 bra $L__BB1_1150; sub.f32 %f3528, %f5407, %f799; abs.f32 %f831, %f3528; setp.le.f32 %p1608, %f831, 
0f34000000; mov.pred %p2934, %p1601; @%p1608 bra $L__BB1_1150; abs.f32 %f3529, %f5407; abs.f32 %f3530, %f799; setp.gt.f32 %p1609, %f3530, %f3529; selp.f32 %f3531, %f3530, %f3529, %p1609; mul.f32 %f3532, %f3531, 0f34000000; setp.le.f32 %p2934, %f831, %f3532; bra.uni $L__BB1_1150; $L__BB1_1143: setp.eq.f32 %p1594, %f811, 0f7F800000; and.b32 %r3284, %r5134, 2147483647; mov.b32 %f3521, %r3284; setp.eq.f32 %p1595, %f3521, 0f7F800000; or.pred %p1596, %p1594, %p1595; mov.pred %p2934, 0; @%p1596 bra $L__BB1_1150; bra.uni $L__BB1_1144; $L__BB1_1150: cvt.u64.u32 %rd8413, %r5135; cvt.u64.u32 %rd8414, %r5134; bfi.b64 %rd2818, %rd8413, %rd8414, 32, 32; mov.b64 {%r3286, %r3287}, %rd2818; selp.u64 %rd2819, 1, 0, %p2934; mov.b32 %f3533, %r3286; sub.f32 %f3534, %f3533, %f798; mov.b32 %f3535, %r3287; sub.f32 %f3536, %f3535, %f799; mul.f32 %f3537, %f3536, %f3536; fma.rn.f32 %f3538, %f3534, %f3534, %f3537; add.f32 %f832, %f3538, 0f00000000; setp.geu.f32 %p1610, %f832, %f5408; @%p1610 bra $L__BB1_1154; sqrt.rn.f32 %f3539, %f832; setp.gtu.f32 %p1611, %f3539, %f6; mov.f32 %f5408, %f832; @%p1611 bra $L__BB1_1154; mov.b64 {%r5138, %r3288}, %rd2819; mov.u32 %r5136, %r3286; mov.u32 %r5137, %r3287; mov.f32 %f5408, %f832; $L__BB1_1154: add.s64 %rd2816, %rd2816, 1; setp.lt.u64 %p1612, %rd2816, %rd2812; @%p1612 bra $L__BB1_1127; st.local.u32 [%rd2809+8], %r5138; mov.b64 %rd8415, {%r5136, %r5137}; st.local.u64 [%rd2809], %rd8415; $L__BB1_1156: cvt.u64.u32 %rd8416, %r867; cvt.u64.u32 %rd8417, %r868; bfi.b64 %rd2821, %rd8417, %rd8416, 32, 32; ld.local.v4.u32 {%r3292, %r3293, %r3294, %r3295}, [%rd2809]; mov.b64 %rd2823, {%r3294, %r3295}; mov.b64 %rd2822, {%r3292, %r3293}; mov.b32 {%rs621, %rs622}, %r3294; and.b16 %rs623, %rs621, 255; setp.eq.s16 %p1613, %rs623, 2; cvt.u64.u16 %rd8418, %rs621; and.b64 %rd8419, %rd8418, 255; selp.b64 %rd8420, 2, %rd8419, %p1613; and.b64 %rd8421, %rd2823, 4294967040; or.b64 %rd8422, %rd8421, %rd8420; mov.b64 {%r3300, %r3301}, %rd8422; mov.b32 {%rs1030, %rs624}, %r3300; 
and.b16 %rs625, %rs1030, 255; setp.eq.s16 %p1614, %rs625, 2; mov.u32 %r5141, 2; mov.u32 %r5139, 0; mov.u32 %r5140, %r5139; @%p1614 bra $L__BB1_1180; ld.global.u8 %rs626, [%rd2320+64]; setp.eq.s16 %p1615, %rs626, 0; shr.u64 %rd8423, %rd2822, 32; cvt.u32.u64 %r3302, %rd8423; mov.b32 %f834, %r3302; @%p1615 bra $L__BB1_1161; mov.b64 {%r3303, %r3304}, %rd2821; mov.b32 %f836, %r3304; mov.b32 %f835, %r3303; mov.b64 {%r3305, %r3306}, %rd2807; mov.b64 {%r3307, %r3308}, %rd2808; ld.global.u8 %rs113, [%rd2320+65]; mov.b32 %f3540, %r3307; setp.gt.f32 %p1617, %f835, %f3540; mov.b32 %f3541, %r3305; setp.lt.f32 %p1618, %f835, %f3541; or.pred %p1619, %p1618, %p1617; mov.pred %p2935, 0; @%p1619 bra $L__BB1_1160; setp.geu.f32 %p1620, %f836, 0fFF7FFFFF; setp.leu.f32 %p1621, %f836, 0f7F7FFFFF; and.pred %p2935, %p1621, %p1620; $L__BB1_1160: setp.ge.f32 %p1622, %f799, %f834; setp.le.f32 %p1623, %f799, %f834; setp.eq.s16 %p1624, %rs113, 0; selp.u32 %r3309, -1, 0, %p1622; selp.u32 %r3310, -1, 0, %p1623; selp.b32 %r3311, %r3310, %r3309, %p1624; and.b32 %r3312, %r3311, 1; setp.eq.b32 %p1625, %r3312, 1; and.pred %p1626, %p1625, %p2935; selp.u16 %rs1030, 1, 0, %p1626; $L__BB1_1161: cvt.u32.u64 %r3313, %rd2822; mov.b32 %f3542, %r3313; mul.f32 %f3543, %f797, %f3542; ld.global.f32 %f3544, [%rd2320+252]; mul.f32 %f3545, %f3544, %f834; sub.f32 %f3546, %f3543, %f3545; mul.f32 %f3547, %f3544, %f3542; fma.rn.f32 %f3548, %f797, %f834, %f3547; add.f32 %f3549, %f795, %f3546; mov.b32 %r3314, %f3549; add.f32 %f3550, %f796, %f3548; mov.b32 %r3315, %f3550; cvt.u64.u32 %rd8424, %r3315; cvt.u64.u32 %rd8425, %r3314; cvt.u64.u16 %rd8426, %rs1030; bfi.b64 %rd8427, %rd8424, %rd8425, 32, 32; and.b64 %rd8428, %rd8426, 255; mov.b64 {%r5139, %r5140}, %rd8427; mov.b64 {%r5141, %r3316}, %rd8428; bra.uni $L__BB1_1180; $L__BB1_979: ld.local.u32 %r3056, [%rd2324+24]; setp.eq.s32 %p1382, %r3056, 0; @%p1382 bra $L__BB1_992; setp.ne.s32 %p1383, %r3056, 1; @%p1383 bra $L__BB1_1005; add.s64 %rd2344, %rd12133, 1; or.b64 
%rd7904, %rd2344, %rd2325; and.b64 %rd7905, %rd7904, -4294967296; setp.eq.s64 %p1384, %rd7905, 0; @%p1384 bra $L__BB1_983; rem.u64 %rd12137, %rd2344, %rd2325; bra.uni $L__BB1_984; $L__BB1_992: setp.eq.s64 %p1391, %rd12133, 0; selp.b64 %rd2391, %rd2325, %rd12133, %p1391; add.s64 %rd7944, %rd2391, -1; setp.gt.u64 %p1392, %rd2325, %rd7944; @%p1392 bra $L__BB1_994; bra.uni $L__BB1_993; $L__BB1_994: shl.b64 %rd7945, %rd2391, 3; add.s64 %rd7946, %rd2326, %rd7945; ld.u32 %rd7947, [%rd7946+-8]; ld.u32 %rd7948, [%rd7946+-4]; bfi.b64 %rd2392, %rd7948, %rd7947, 32, 32; or.b64 %rd7949, %rd2391, %rd2325; and.b64 %rd7950, %rd7949, -4294967296; setp.eq.s64 %p1393, %rd7950, 0; @%p1393 bra $L__BB1_996; rem.u64 %rd12154, %rd2391, %rd2325; bra.uni $L__BB1_997; $L__BB1_1110: ld.u32 %r3242, [%rd2457+76]; cvt.u64.u32 %rd8332, %r3242; setp.le.u64 %p1559, %rd2448, %rd8332; mul.wide.u32 %rd8333, %r3242, 12; add.s64 %rd8334, %rd2449, %rd8333; setp.eq.s64 %p1560, %rd8334, 0; or.pred %p1561, %p1559, %p1560; selp.b32 %r761, %r761, %r5095, %p1561; selp.b32 %r760, %r760, %r5094, %p1561; selp.b32 %r759, %r759, %r5093, %p1561; selp.b32 %r763, %r763, %r5108, %p1561; selp.b32 %r764, %r764, %r813, %p1561; $L__BB1_1015: mov.b32 %f730, %r764; $L__BB1_1016: mov.u32 %r765, %r766; setp.eq.s32 %p1403, %r765, 0; @%p1403 bra $L__BB1_1117; cvt.u64.u32 %rd8013, %r765; add.s64 %rd8014, %rd8013, -1; cvt.u32.u64 %r766, %rd8014; st.local.u32 [%rd2446+512], %r766; mul.wide.u32 %rd8015, %r765, 8; add.s64 %rd8016, %rd2446, %rd8015; ld.local.u32 %rd2455, [%rd8016+-4]; ld.local.u32 %rd8017, [%rd8016+-8]; shl.b64 %rd8018, %rd8017, 32; or.b64 %rd2454, %rd8018, 1; mov.b64 {%r3098, %r3099}, %rd2455; mov.b32 %f3287, %r3098; neg.f32 %f3288, %f3287; setp.le.f32 %p1404, %f730, %f3288; @%p1404 bra $L__BB1_1016; mov.b64 {%r3100, %r3101}, %rd2454; cvt.u64.u32 %rd2456, %r3101; setp.gt.u64 %p1405, %rd2445, %rd2456; @%p1405 bra $L__BB1_1020; bra.uni $L__BB1_1019; $L__BB1_1020: mul.lo.s64 %rd8019, %rd2456, 96; add.s64 %rd2457, 
%rd2447, %rd8019; ld.u8 %rs561, [%rd2457+88]; and.b16 %rs562, %rs561, 1; setp.eq.b16 %p1407, %rs562, 1; mov.pred %p2933, 0; xor.pred %p1408, %p1407, %p2933; not.pred %p1409, %p1408; @%p1409 bra $L__BB1_1022; ld.v4.u32 {%r3102, %r3103, %r3104, %r3105}, [%rd2457+64]; cvt.u64.u32 %rd8020, %r3102; setp.gt.u64 %p1411, %rd2448, %rd8020; mul.wide.u32 %rd8021, %r3102, 12; add.s64 %rd8022, %rd2449, %rd8021; selp.b64 %rd8023, %rd8022, 0, %p1411; setp.eq.s64 %p1412, %rd8023, 0; add.s64 %rd8024, %rd8023, 8; selp.b64 %rd12175, 0, %rd8024, %p1412; cvt.u64.u32 %rd8025, %r3103; setp.gt.u64 %p1413, %rd2448, %rd8025; mul.wide.u32 %rd8026, %r3103, 12; add.s64 %rd8027, %rd2449, %rd8026; selp.b64 %rd8028, %rd8027, 0, %p1413; setp.eq.s64 %p1414, %rd8028, 0; add.s64 %rd8029, %rd8028, 8; selp.b64 %rd12174, 0, %rd8029, %p1414; ld.u32 %r3109, [%rd2457+72]; cvt.u64.u32 %rd8030, %r3109; setp.gt.u64 %p1415, %rd2448, %rd8030; mul.wide.u32 %rd8031, %r3109, 12; add.s64 %rd8032, %rd2449, %rd8031; selp.b64 %rd8033, %rd8032, 0, %p1415; setp.eq.s64 %p1416, %rd8033, 0; add.s64 %rd8034, %rd8033, 8; selp.b64 %rd12173, 0, %rd8034, %p1416; cvt.u64.u32 %rd8035, %r3105; setp.gt.u64 %p1417, %rd2448, %rd8035; mul.wide.u32 %rd8036, %r3105, 12; add.s64 %rd8037, %rd2449, %rd8036; selp.b64 %rd8038, %rd8037, 0, %p1417; setp.eq.s64 %p1418, %rd8038, 0; add.s64 %rd8039, %rd8038, 8; selp.b64 %rd12172, 0, %rd8039, %p1418; mov.pred %p2933, -1; $L__BB1_1022: ld.v4.f32 {%f3289, %f3290, %f3291, %f3292}, [%rd2457]; sub.f32 %f3297, %f3289, %f728; sub.f32 %f3298, %f3290, %f728; sub.f32 %f3299, %f3291, %f728; sub.f32 %f3300, %f3292, %f728; ld.v4.f32 {%f3301, %f3302, %f3303, %f3304}, [%rd2457+16]; sub.f32 %f3309, %f3301, %f729; sub.f32 %f3310, %f3302, %f729; sub.f32 %f3311, %f3303, %f729; sub.f32 %f3312, %f3304, %f729; ld.v4.f32 {%f3313, %f3314, %f3315, %f3316}, [%rd2457+32]; sub.f32 %f3321, %f728, %f3313; sub.f32 %f3322, %f728, %f3314; sub.f32 %f3323, %f728, %f3315; sub.f32 %f3324, %f728, %f3316; ld.v4.f32 {%f3325, %f3326, 
%f3327, %f3328}, [%rd2457+48]; sub.f32 %f3333, %f729, %f3325; sub.f32 %f3334, %f729, %f3326; sub.f32 %f3335, %f729, %f3327; sub.f32 %f3336, %f729, %f3328; setp.ge.f32 %p1419, %f3297, %f3321; selp.f32 %f3337, %f3297, %f3321, %p1419; setp.ge.f32 %p1420, %f3298, %f3322; selp.f32 %f3338, %f3298, %f3322, %p1420; setp.ge.f32 %p1421, %f3299, %f3323; selp.f32 %f3339, %f3299, %f3323, %p1421; setp.ge.f32 %p1422, %f3300, %f3324; selp.f32 %f3340, %f3300, %f3324, %p1422; setp.ge.f32 %p1423, %f3309, %f3333; selp.f32 %f3341, %f3309, %f3333, %p1423; setp.ge.f32 %p1424, %f3310, %f3334; selp.f32 %f3342, %f3310, %f3334, %p1424; setp.ge.f32 %p1425, %f3311, %f3335; selp.f32 %f3343, %f3311, %f3335, %p1425; setp.ge.f32 %p1426, %f3312, %f3336; selp.f32 %f3344, %f3312, %f3336, %p1426; setp.ge.f32 %p1427, %f3337, 0f00000000; selp.f32 %f3345, %f3337, 0f00000000, %p1427; setp.ge.f32 %p1428, %f3338, 0f00000000; selp.f32 %f3346, %f3338, 0f00000000, %p1428; setp.ge.f32 %p1429, %f3339, 0f00000000; selp.f32 %f3347, %f3339, 0f00000000, %p1429; setp.ge.f32 %p1430, %f3340, 0f00000000; selp.f32 %f3348, %f3340, 0f00000000, %p1430; mov.b32 %r3110, %f3345; mov.b32 %r3111, %f3346; mov.b32 %r3112, %f3347; mov.b32 %r3113, %f3348; cvt.u64.u32 %rd8040, %r3113; cvt.u64.u32 %rd8041, %r3111; cvt.u64.u32 %rd8042, %r3110; cvt.u64.u32 %rd8043, %r3112; bfi.b64 %rd8044, %rd8040, %rd8043, 32, 32; bfi.b64 %rd8045, %rd8041, %rd8042, 32, 32; setp.ge.f32 %p1431, %f3341, 0f00000000; selp.f32 %f3349, %f3341, 0f00000000, %p1431; setp.ge.f32 %p1432, %f3342, 0f00000000; selp.f32 %f3350, %f3342, 0f00000000, %p1432; setp.ge.f32 %p1433, %f3343, 0f00000000; selp.f32 %f3351, %f3343, 0f00000000, %p1433; setp.ge.f32 %p1434, %f3344, 0f00000000; selp.f32 %f3352, %f3344, 0f00000000, %p1434; mov.b32 %r3114, %f3349; mov.b32 %r3115, %f3350; mov.b32 %r3116, %f3351; mov.b32 %r3117, %f3352; cvt.u64.u32 %rd8046, %r3117; cvt.u64.u32 %rd8047, %r3115; cvt.u64.u32 %rd8048, %r3114; cvt.u64.u32 %rd8049, %r3116; bfi.b64 %rd8050, %rd8046, %rd8049, 32, 
32; bfi.b64 %rd8051, %rd8047, %rd8048, 32, 32; mov.b64 {%r3118, %r3119}, %rd8045; mov.b64 {%r3120, %r3121}, %rd8044; cvt.u64.u32 %rd8052, %r3121; cvt.u64.u32 %rd8053, %r3119; cvt.u64.u32 %rd8054, %r3120; bfi.b64 %rd8055, %rd8052, %rd8054, 32, 32; mov.b64 {%r3122, %r3123}, %rd8055; bfi.b64 %rd8056, %rd8053, %rd8042, 32, 32; mov.b64 {%r3124, %r3125}, %rd8056; mov.b32 %f3353, %r3124; mov.b32 %f3354, %r3125; mov.b32 %f3355, %r3122; mov.b32 %f3356, %r3123; mov.b32 %f3357, %r3118; mov.b32 %f3358, %r3119; mov.b32 %f3359, %r3120; mov.b32 %f3360, %r3121; mov.b64 {%r3126, %r3127}, %rd8051; mov.b64 {%r3128, %r3129}, %rd8050; cvt.u64.u32 %rd8057, %r3129; cvt.u64.u32 %rd8058, %r3127; cvt.u64.u32 %rd8059, %r3128; bfi.b64 %rd8060, %rd8057, %rd8059, 32, 32; mov.b64 {%r3130, %r3131}, %rd8060; bfi.b64 %rd8061, %rd8058, %rd8048, 32, 32; mov.b64 {%r3132, %r3133}, %rd8061; mov.b32 %f3361, %r3132; mov.b32 %f3362, %r3133; mov.b32 %f3363, %r3130; mov.b32 %f3364, %r3131; mov.b32 %f3365, %r3126; mov.b32 %f3366, %r3127; mov.b32 %f3367, %r3128; mov.b32 %f3368, %r3129; mul.f32 %f3369, %f3365, %f3361; mul.f32 %f3370, %f3366, %f3362; mul.f32 %f3371, %f3367, %f3363; mul.f32 %f3372, %f3368, %f3364; fma.rn.f32 %f3373, %f3357, %f3353, %f3369; fma.rn.f32 %f3374, %f3358, %f3354, %f3370; fma.rn.f32 %f3375, %f3359, %f3355, %f3371; fma.rn.f32 %f3376, %f3360, %f3356, %f3372; add.f32 %f3377, %f3373, 0f00000000; add.f32 %f3378, %f3374, 0f00000000; add.f32 %f3379, %f3375, 0f00000000; add.f32 %f3380, %f3376, 0f00000000; sqrt.rn.f32 %f3381, %f3377; sqrt.rn.f32 %f3382, %f3378; sqrt.rn.f32 %f3383, %f3379; sqrt.rn.f32 %f3384, %f3380; mov.b32 %r3134, %f3381; mov.b32 %r3135, %f3382; mov.b32 %r3136, %f3383; mov.b32 %r3137, %f3384; cvt.u64.u32 %rd8062, %r3137; cvt.u64.u32 %rd8063, %r3135; cvt.u64.u32 %rd8064, %r3134; cvt.u64.u32 %rd8065, %r3136; bfi.b64 %rd12281, %rd8062, %rd8065, 32, 32; mov.b64 {%r3138, %r3139}, %rd12281; bfi.b64 %rd12280, %rd8063, %rd8064, 32, 32; mov.b64 {%r3140, %r3141}, %rd12280; mov.b32 
%f3385, %r3140; mov.b32 %f3386, %r3141; mov.b32 %f3387, %r3138; mov.b32 %f3388, %r3139; setp.lt.f32 %p1435, %f3385, %f730; setp.lt.f32 %p1436, %f3386, %f730; setp.lt.f32 %p1437, %f3387, %f730; setp.lt.f32 %p1438, %f3388, %f730; selp.u32 %r3142, 1, 0, %p1435; selp.u32 %r3143, -1, 0, %p1436; bfi.b32 %r3144, %r3143, %r3142, 8, 1; selp.u32 %r3145, -1, 0, %p1437; bfi.b32 %r3146, %r3145, %r3144, 16, 1; selp.u32 %r3147, -1, 0, %p1438; bfi.b32 %r3148, %r3147, %r3146, 24, 1; cvt.u64.u32 %rd8066, %r3148; mov.b64 {%r3149, %r3150}, %rd8066; mov.b32 {%rs563, %rs564}, %r3149; and.b16 %rs565, %rs563, 1; shr.u16 %rs566, %rs563, 7; and.b16 %rs567, %rs566, 2; or.b16 %rs568, %rs567, %rs565; shl.b16 %rs569, %rs564, 2; and.b16 %rs570, %rs569, 4; or.b16 %rs571, %rs568, %rs570; shr.u16 %rs572, %rs564, 5; and.b16 %rs573, %rs572, 8; or.b16 %rs574, %rs571, %rs573; cvt.u64.u16 %rd2468, %rs574; @%p2933 bra $L__BB1_1024; bra.uni $L__BB1_1023; $L__BB1_1024: mov.u64 %rd8067, 1; st.local.v2.u64 [%rd8], {%rd12175, %rd12174}; st.local.v2.u64 [%rd8+16], {%rd12173, %rd12172}; mov.f32 %f3389, 0f00000000; st.local.v4.f32 [%rd24], {%f3389, %f3389, %f3389, %f3389}; mov.u32 %r3161, 4; st.local.u32 [%rd7+16], %r3161; st.local.u32 [%rd7+52], %r3161; st.local.u32 [%rd7+88], %r3161; st.local.u32 [%rd7+124], %r3161; mov.u64 %rd2472, %rd8067; $L__BB1_1025: add.s64 %rd8071, %rd2472, -1; cvt.u32.u64 %r3162, %rd8071; shl.b64 %rd8073, %rd8067, %r3162; and.b64 %rd8074, %rd8073, %rd2468; setp.eq.s64 %p1439, %rd8074, 0; @%p1439 bra $L__BB1_1078; shl.b64 %rd8075, %rd2472, 3; add.s64 %rd8076, %rd8, %rd8075; ld.local.u64 %rd2473, [%rd8076+-8]; setp.eq.s64 %p1440, %rd2473, 0; @%p1440 bra $L__BB1_1078; ld.u32 %r767, [%rd2473]; cvt.u64.u32 %rd2474, %r767; ld.global.u64 %rd8077, [%rd2320+112]; setp.gt.u64 %p1441, %rd8077, %rd2474; @%p1441 bra $L__BB1_1029; bra.uni $L__BB1_1028; $L__BB1_1029: ld.global.u64 %rd8078, [%rd2320+104]; mul.lo.s64 %rd8079, %rd2474, 12; add.s64 %rd2475, %rd8078, %rd8079; ld.u32 %rd2476, [%rd2475+8]; 
ld.u32 %rd2477, [%rd2475]; ld.global.u64 %rd2478, [%rd2320+96]; setp.gt.u64 %p1442, %rd2478, %rd2477; @%p1442 bra $L__BB1_1031; bra.uni $L__BB1_1030; $L__BB1_1031: ld.global.u64 %rd2479, [%rd2320+88]; shl.b64 %rd8080, %rd2477, 3; add.s64 %rd8081, %rd2479, %rd8080; ld.u32 %rd8082, [%rd8081]; ld.u32 %rd8083, [%rd8081+4]; bfi.b64 %rd2480, %rd8083, %rd8082, 32, 32; ld.u32 %rd2481, [%rd2475+4]; setp.gt.u64 %p1443, %rd2478, %rd2481; @%p1443 bra $L__BB1_1033; bra.uni $L__BB1_1032; $L__BB1_1033: setp.gt.u64 %p1444, %rd2478, %rd2476; @%p1444 bra $L__BB1_1035; bra.uni $L__BB1_1034; $L__BB1_1035: shl.b64 %rd8084, %rd2481, 3; add.s64 %rd8085, %rd2479, %rd8084; shl.b64 %rd8086, %rd2476, 3; add.s64 %rd8087, %rd2479, %rd8086; cvt.u32.u64 %r3163, %rd2480; mov.b32 %f731, %r3163; shr.u64 %rd8088, %rd2480, 32; cvt.u32.u64 %r3164, %rd8088; mov.b32 %f732, %r3164; ld.u32 %rd8089, [%rd8085]; ld.u32 %rd8090, [%rd8085+4]; bfi.b64 %rd2482, %rd8090, %rd8089, 32, 32; cvt.u32.u64 %r3165, %rd2482; shr.u64 %rd8091, %rd2482, 32; cvt.u32.u64 %r3166, %rd8091; mov.b32 %f733, %r3165; sub.f32 %f734, %f733, %f731; mov.b32 %f5400, %r3166; sub.f32 %f736, %f5400, %f732; ld.u32 %rd8092, [%rd8087]; ld.u32 %rd8093, [%rd8087+4]; bfi.b64 %rd2483, %rd8093, %rd8092, 32, 32; cvt.u32.u64 %r3167, %rd2483; shr.u64 %rd8094, %rd2483, 32; cvt.u32.u64 %r3168, %rd8094; mov.b32 %f737, %r3167; sub.f32 %f738, %f737, %f731; mov.b32 %f739, %r3168; sub.f32 %f740, %f739, %f732; sub.f32 %f741, %f728, %f731; sub.f32 %f742, %f729, %f732; mul.f32 %f3390, %f736, %f742; fma.rn.f32 %f743, %f734, %f741, %f3390; mul.f32 %f3391, %f740, %f742; fma.rn.f32 %f744, %f738, %f741, %f3391; setp.le.f32 %p1445, %f743, 0f00000000; setp.le.f32 %p1446, %f744, 0f00000000; and.pred %p1447, %p1445, %p1446; @%p1447 bra $L__BB1_1073; bra.uni $L__BB1_1036; $L__BB1_1073: add.u64 %rd12266, %SP, 552; cvta.to.local.u64 %rd12264, %rd12266; add.u64 %rd12272, %SP, 0; cvta.to.local.u64 %rd12270, %rd12272; st.local.u64 [%rd12270], %rd2480; mov.u64 %rd12277, 2; 
mov.u64 %rd12263, %rd2302; mov.u64 %rd12265, %rd12264; mov.u64 %rd12267, %rd12264; mov.u64 %rd12268, %rd12264; mov.u64 %rd12269, %rd12266; mov.u64 %rd12271, %rd12270; mov.u64 %rd12273, %rd12270; mov.u64 %rd12274, %rd12270; mov.u64 %rd12275, %rd12272; mov.u64 %rd12276, %rd2296; $L__BB1_1074: setp.eq.s64 %p1500, %rd12277, 0; mov.u64 %rd12278, 1; @%p1500 bra $L__BB1_1076; add.s64 %rd12277, %rd12277, -1; add.s64 %rd8239, %rd12264, 8; setp.eq.s64 %p1501, %rd12267, %rd12263; selp.b64 %rd8240, %rd8239, %rd12267, %p1501; add.s64 %rd8241, %rd12265, 8; selp.b64 %rd8242, %rd8241, %rd12268, %p1501; add.s64 %rd8243, %rd12266, 8; selp.b64 %rd8244, %rd8243, %rd12269, %p1501; mov.u64 %rd12278, 0; setp.eq.s64 %p1502, %rd12277, 0; add.s64 %rd8245, %rd8240, 4; add.s64 %rd8246, %rd8242, 4; add.s64 %rd8247, %rd8244, 4; selp.b64 %rd2709, %rd8240, %rd8245, %p1502; selp.b64 %rd12268, %rd8242, %rd8246, %p1502; selp.b64 %rd12269, %rd8244, %rd8247, %p1502; selp.b64 %rd12264, %rd8239, %rd12264, %p1501; selp.b64 %rd12265, %rd8241, %rd12265, %p1501; selp.b64 %rd12266, %rd8243, %rd12266, %p1501; add.s64 %rd8248, %rd12267, 8; selp.b64 %rd12263, %rd8248, %rd12263, %p1501; add.s64 %rd8249, %rd12273, 8; setp.eq.s64 %p1503, %rd12270, %rd12276; selp.b64 %rd8250, %rd8249, %rd12270, %p1503; add.s64 %rd8251, %rd12274, 8; selp.b64 %rd8252, %rd8251, %rd12271, %p1503; add.s64 %rd8253, %rd12275, 8; selp.b64 %rd8254, %rd8253, %rd12272, %p1503; selp.b64 %rd12273, %rd8249, %rd12273, %p1503; selp.b64 %rd12274, %rd8251, %rd12274, %p1503; selp.b64 %rd12275, %rd8253, %rd12275, %p1503; add.s64 %rd8255, %rd12270, 8; selp.b64 %rd12276, %rd8255, %rd12276, %p1503; add.s64 %rd8256, %rd8250, 4; add.s64 %rd8257, %rd8252, 4; add.s64 %rd8258, %rd8254, 4; selp.b64 %rd12270, %rd8250, %rd8256, %p1502; selp.b64 %rd12271, %rd8252, %rd8257, %p1502; selp.b64 %rd12272, %rd8254, %rd8258, %p1502; ld.local.f32 %f3457, [%rd8252]; ld.local.f32 %f3458, [%rd8242]; setp.eq.f32 %p1504, %f3458, %f3457; mov.u64 %rd12267, %rd2709; @%p1504 bra 
$L__BB1_1074; $L__BB1_1076: mov.u64 %rd11150, 0; or.b64 %rd8260, %rd11150, %rd2480; mov.b64 {%r3210, %r3211}, %rd8260; mov.b64 {%r3212, %r3213}, %rd12278; cvt.u32.u64 %r3215, %rd11150; or.b32 %r5090, %r3215, %r3163; mov.u32 %r5091, 0; mov.b32 %f5404, %r3211; mov.b32 {%rs1029, %rs593}, %r3212; mov.u32 %r5092, %r5091; bra.uni $L__BB1_1077; $L__BB1_1036: sub.f32 %f745, %f728, %f733; sub.f32 %f746, %f729, %f5400; mul.f32 %f3392, %f736, %f746; fma.rn.f32 %f747, %f734, %f745, %f3392; mul.f32 %f3393, %f740, %f746; fma.rn.f32 %f748, %f738, %f745, %f3393; setp.ge.f32 %p1448, %f747, 0f00000000; setp.le.f32 %p1449, %f748, %f747; and.pred %p1450, %p1449, %p1448; @%p1450 bra $L__BB1_1069; bra.uni $L__BB1_1037; $L__BB1_1069: add.u64 %rd12250, %SP, 552; cvta.to.local.u64 %rd12248, %rd12250; add.u64 %rd12256, %SP, 0; cvta.to.local.u64 %rd12254, %rd12256; st.local.u64 [%rd12254], %rd2482; mov.u64 %rd12261, 2; mov.u64 %rd12247, %rd2302; mov.u64 %rd12249, %rd12248; mov.u64 %rd12251, %rd12248; mov.u64 %rd12252, %rd12248; mov.u64 %rd12253, %rd12250; mov.u64 %rd12255, %rd12254; mov.u64 %rd12257, %rd12254; mov.u64 %rd12258, %rd12254; mov.u64 %rd12259, %rd12256; mov.u64 %rd12260, %rd2297; $L__BB1_1070: setp.eq.s64 %p1495, %rd12261, 0; mov.u64 %rd12262, 1; @%p1495 bra $L__BB1_1072; add.s64 %rd12261, %rd12261, -1; add.s64 %rd8212, %rd12248, 8; setp.eq.s64 %p1496, %rd12251, %rd12247; selp.b64 %rd8213, %rd8212, %rd12251, %p1496; add.s64 %rd8214, %rd12249, 8; selp.b64 %rd8215, %rd8214, %rd12252, %p1496; add.s64 %rd8216, %rd12250, 8; selp.b64 %rd8217, %rd8216, %rd12253, %p1496; mov.u64 %rd12262, 0; setp.eq.s64 %p1497, %rd12261, 0; add.s64 %rd8218, %rd8213, 4; add.s64 %rd8219, %rd8215, 4; add.s64 %rd8220, %rd8217, 4; selp.b64 %rd2671, %rd8213, %rd8218, %p1497; selp.b64 %rd12252, %rd8215, %rd8219, %p1497; selp.b64 %rd12253, %rd8217, %rd8220, %p1497; selp.b64 %rd12248, %rd8212, %rd12248, %p1496; selp.b64 %rd12249, %rd8214, %rd12249, %p1496; selp.b64 %rd12250, %rd8216, %rd12250, %p1496; add.s64 
%rd8221, %rd12251, 8; selp.b64 %rd12247, %rd8221, %rd12247, %p1496; add.s64 %rd8222, %rd12257, 8; setp.eq.s64 %p1498, %rd12254, %rd12260; selp.b64 %rd8223, %rd8222, %rd12254, %p1498; add.s64 %rd8224, %rd12258, 8; selp.b64 %rd8225, %rd8224, %rd12255, %p1498; add.s64 %rd8226, %rd12259, 8; selp.b64 %rd8227, %rd8226, %rd12256, %p1498; selp.b64 %rd12257, %rd8222, %rd12257, %p1498; selp.b64 %rd12258, %rd8224, %rd12258, %p1498; selp.b64 %rd12259, %rd8226, %rd12259, %p1498; add.s64 %rd8228, %rd12254, 8; selp.b64 %rd12260, %rd8228, %rd12260, %p1498; add.s64 %rd8229, %rd8223, 4; add.s64 %rd8230, %rd8225, 4; add.s64 %rd8231, %rd8227, 4; selp.b64 %rd12254, %rd8223, %rd8229, %p1497; selp.b64 %rd12255, %rd8225, %rd8230, %p1497; selp.b64 %rd12256, %rd8227, %rd8231, %p1497; ld.local.f32 %f3455, [%rd8225]; ld.local.f32 %f3456, [%rd8215]; setp.eq.f32 %p1499, %f3456, %f3455; mov.u64 %rd12251, %rd2671; @%p1499 bra $L__BB1_1070; $L__BB1_1072: mov.u64 %rd11149, 0; or.b64 %rd8233, %rd11149, %rd2482; mov.b64 {%r3202, %r3203}, %rd8233; mov.b64 {%r3204, %r3205}, %rd12262; cvt.u32.u64 %r3207, %rd11149; or.b32 %r5090, %r3207, %r3165; mov.u32 %r5091, 0; mov.b32 %f5404, %r3203; mov.u32 %r5092, 1; mov.b32 {%rs1029, %rs589}, %r3204; bra.uni $L__BB1_1077; $L__BB1_1037: sub.f32 %f749, %f728, %f737; sub.f32 %f750, %f729, %f739; mul.f32 %f3394, %f736, %f750; fma.rn.f32 %f751, %f734, %f749, %f3394; mul.f32 %f3395, %f740, %f750; fma.rn.f32 %f752, %f738, %f749, %f3395; setp.ge.f32 %p1451, %f752, 0f00000000; setp.le.f32 %p1452, %f751, %f752; and.pred %p1453, %p1452, %p1451; @%p1453 bra $L__BB1_1065; bra.uni $L__BB1_1038; $L__BB1_1065: add.u64 %rd12234, %SP, 552; cvta.to.local.u64 %rd12232, %rd12234; add.u64 %rd12240, %SP, 0; cvta.to.local.u64 %rd12238, %rd12240; st.local.u64 [%rd12238], %rd2483; mov.u64 %rd12245, 2; mov.u64 %rd12231, %rd2302; mov.u64 %rd12233, %rd12232; mov.u64 %rd12235, %rd12232; mov.u64 %rd12236, %rd12232; mov.u64 %rd12237, %rd12234; mov.u64 %rd12239, %rd12238; mov.u64 %rd12241, 
%rd12238; mov.u64 %rd12242, %rd12238; mov.u64 %rd12243, %rd12240; mov.u64 %rd12244, %rd2298; $L__BB1_1066: setp.eq.s64 %p1490, %rd12245, 0; mov.u64 %rd12246, 1; @%p1490 bra $L__BB1_1068; add.s64 %rd12245, %rd12245, -1; add.s64 %rd8185, %rd12232, 8; setp.eq.s64 %p1491, %rd12235, %rd12231; selp.b64 %rd8186, %rd8185, %rd12235, %p1491; add.s64 %rd8187, %rd12233, 8; selp.b64 %rd8188, %rd8187, %rd12236, %p1491; add.s64 %rd8189, %rd12234, 8; selp.b64 %rd8190, %rd8189, %rd12237, %p1491; mov.u64 %rd12246, 0; setp.eq.s64 %p1492, %rd12245, 0; add.s64 %rd8191, %rd8186, 4; add.s64 %rd8192, %rd8188, 4; add.s64 %rd8193, %rd8190, 4; selp.b64 %rd2633, %rd8186, %rd8191, %p1492; selp.b64 %rd12236, %rd8188, %rd8192, %p1492; selp.b64 %rd12237, %rd8190, %rd8193, %p1492; selp.b64 %rd12232, %rd8185, %rd12232, %p1491; selp.b64 %rd12233, %rd8187, %rd12233, %p1491; selp.b64 %rd12234, %rd8189, %rd12234, %p1491; add.s64 %rd8194, %rd12235, 8; selp.b64 %rd12231, %rd8194, %rd12231, %p1491; add.s64 %rd8195, %rd12241, 8; setp.eq.s64 %p1493, %rd12238, %rd12244; selp.b64 %rd8196, %rd8195, %rd12238, %p1493; add.s64 %rd8197, %rd12242, 8; selp.b64 %rd8198, %rd8197, %rd12239, %p1493; add.s64 %rd8199, %rd12243, 8; selp.b64 %rd8200, %rd8199, %rd12240, %p1493; selp.b64 %rd12241, %rd8195, %rd12241, %p1493; selp.b64 %rd12242, %rd8197, %rd12242, %p1493; selp.b64 %rd12243, %rd8199, %rd12243, %p1493; add.s64 %rd8201, %rd12238, 8; selp.b64 %rd12244, %rd8201, %rd12244, %p1493; add.s64 %rd8202, %rd8196, 4; add.s64 %rd8203, %rd8198, 4; add.s64 %rd8204, %rd8200, 4; selp.b64 %rd12238, %rd8196, %rd8202, %p1492; selp.b64 %rd12239, %rd8198, %rd8203, %p1492; selp.b64 %rd12240, %rd8200, %rd8204, %p1492; ld.local.f32 %f3453, [%rd8198]; ld.local.f32 %f3454, [%rd8188]; setp.eq.f32 %p1494, %f3454, %f3453; mov.u64 %rd12235, %rd2633; @%p1494 bra $L__BB1_1066; $L__BB1_1068: mov.u64 %rd11148, 0; or.b64 %rd8206, %rd11148, %rd2483; mov.b64 {%r3194, %r3195}, %rd8206; mov.b64 {%r3196, %r3197}, %rd12246; cvt.u32.u64 %r3199, %rd11148; 
or.b32 %r5090, %r3199, %r3167; mov.u32 %r5091, 0; mov.b32 %f5404, %r3195; mov.b32 {%rs1029, %rs585}, %r3196; mov.u32 %r5092, 2; bra.uni $L__BB1_1077; $L__BB1_1038: sub.f32 %f753, %f737, %f733; sub.f32 %f754, %f739, %f5400; mul.f32 %f3396, %f736, %f738; mul.f32 %f3397, %f734, %f740; sub.f32 %f755, %f3397, %f3396; mul.f32 %f3398, %f736, %f741; mul.f32 %f3399, %f734, %f742; sub.f32 %f3400, %f3399, %f3398; mul.f32 %f3401, %f755, %f3400; setp.lt.f32 %p1454, %f3401, 0f00000000; setp.ge.f32 %p1455, %f743, 0f00000000; and.pred %p1456, %p1455, %p1454; setp.le.f32 %p1457, %f747, 0f00000000; and.pred %p1458, %p1457, %p1456; mov.u16 %rs1028, 0; @%p1458 bra $L__BB1_1041; mul.f32 %f3402, %f738, %f750; mul.f32 %f3403, %f749, %f740; sub.f32 %f3404, %f3402, %f3403; mul.f32 %f3405, %f755, %f3404; setp.gt.f32 %p1459, %f3405, 0f80000000; setp.ge.f32 %p1460, %f744, 0f00000000; and.pred %p1461, %p1460, %p1459; setp.le.f32 %p1462, %f752, 0f00000000; and.pred %p1463, %p1462, %p1461; mov.u16 %rs1028, 1; @%p1463 bra $L__BB1_1041; mul.f32 %f3406, %f753, %f746; mul.f32 %f3407, %f745, %f754; sub.f32 %f3408, %f3406, %f3407; mul.f32 %f3409, %f755, %f3408; setp.lt.f32 %p1464, %f3409, 0f00000000; sub.f32 %f3410, %f748, %f747; setp.ge.f32 %p1465, %f3410, 0f00000000; and.pred %p1466, %p1465, %p1464; sub.f32 %f3411, %f751, %f752; setp.ge.f32 %p1467, %f3411, 0f00000000; and.pred %p1468, %p1467, %p1466; selp.b16 %rs1028, 2, 3, %p1468; $L__BB1_1041: mul.f32 %f3412, %f736, %f736; fma.rn.f32 %f3413, %f734, %f734, %f3412; add.f32 %f756, %f3413, 0f00000000; mul.f32 %f3414, %f740, %f740; fma.rn.f32 %f3415, %f738, %f738, %f3414; add.f32 %f757, %f3415, 0f00000000; mul.f32 %f3416, %f754, %f754; fma.rn.f32 %f3417, %f753, %f753, %f3416; add.f32 %f758, %f3417, 0f00000000; setp.eq.s16 %p1469, %rs1028, 1; @%p1469 bra $L__BB1_1056; setp.eq.s16 %p1470, %rs1028, 2; @%p1470 bra $L__BB1_1052; setp.ne.s16 %p1471, %rs1028, 3; @%p1471 bra $L__BB1_1060; sub.f32 %f3418, %f743, %f747; div.rn.f32 %f759, %f743, %f3418; sub.f32 
%f3419, %f744, %f752; div.rn.f32 %f760, %f744, %f3419; sub.f32 %f3420, %f748, %f747; add.f32 %f3421, %f751, %f3420; sub.f32 %f3422, %f3421, %f752; div.rn.f32 %f5402, %f3420, %f3422; mul.f32 %f3423, %f742, %f742; fma.rn.f32 %f3424, %f741, %f741, %f3423; add.f32 %f3425, %f3424, 0f00000000; mul.f32 %f3426, %f756, %f759; mul.f32 %f3427, %f759, %f3426; sub.f32 %f762, %f3425, %f3427; mul.f32 %f3428, %f757, %f5402; mul.f32 %f3429, %f5402, %f3428; sub.f32 %f763, %f3425, %f3429; mul.f32 %f3430, %f746, %f746; fma.rn.f32 %f3431, %f745, %f745, %f3430; add.f32 %f3432, %f3431, 0f00000000; mul.f32 %f3433, %f758, %f760; mul.f32 %f3434, %f760, %f3433; sub.f32 %f764, %f3432, %f3434; setp.lt.f32 %p1472, %f762, %f763; @%p1472 bra $L__BB1_1048; bra.uni $L__BB1_1045; $L__BB1_1048: setp.lt.f32 %p1474, %f762, %f764; @%p1474 bra $L__BB1_1050; bra.uni $L__BB1_1049; $L__BB1_1050: mul.f32 %f5401, %f736, %f759; fma.rn.f32 %f5399, %f734, %f759, %f731; mov.u32 %r5092, 0; mov.f32 %f5400, %f732; mov.f32 %f5402, %f759; bra.uni $L__BB1_1051; $L__BB1_1052: add.u64 %rd12184, %SP, 552; cvta.to.local.u64 %rd12182, %rd12184; add.u64 %rd12190, %SP, 0; cvta.to.local.u64 %rd12188, %rd12190; mul.f32 %f3437, %f754, %f746; fma.rn.f32 %f3438, %f753, %f745, %f3437; div.rn.f32 %f5403, %f3438, %f758; fma.rn.f32 %f3439, %f753, %f5403, %f733; mov.b32 %r3176, %f3439; fma.rn.f32 %f3440, %f754, %f5403, %f5400; mov.b32 %r3177, %f3440; cvt.u64.u32 %rd8098, %r3177; cvt.u64.u32 %rd8099, %r3176; bfi.b64 %rd2491, %rd8098, %rd8099, 32, 32; st.local.u64 [%rd12188], %rd2491; mov.u64 %rd12195, 2; mov.u64 %rd12181, %rd2302; mov.u64 %rd12183, %rd12182; mov.u64 %rd12185, %rd12182; mov.u64 %rd12186, %rd12182; mov.u64 %rd12187, %rd12184; mov.u64 %rd12189, %rd12188; mov.u64 %rd12191, %rd12188; mov.u64 %rd12192, %rd12188; mov.u64 %rd12193, %rd12190; mov.u64 %rd12194, %rd2301; $L__BB1_1053: setp.eq.s64 %p1475, %rd12195, 0; mov.u64 %rd12230, 1; @%p1475 bra $L__BB1_1055; add.s64 %rd12195, %rd12195, -1; add.s64 %rd8104, %rd12182, 8; 
setp.eq.s64 %p1476, %rd12185, %rd12181; selp.b64 %rd8105, %rd8104, %rd12185, %p1476; add.s64 %rd8106, %rd12183, 8; selp.b64 %rd8107, %rd8106, %rd12186, %p1476; add.s64 %rd8108, %rd12184, 8; selp.b64 %rd8109, %rd8108, %rd12187, %p1476; mov.u64 %rd12230, 0; setp.eq.s64 %p1477, %rd12195, 0; add.s64 %rd8110, %rd8105, 4; add.s64 %rd8111, %rd8107, 4; add.s64 %rd8112, %rd8109, 4; selp.b64 %rd2508, %rd8105, %rd8110, %p1477; selp.b64 %rd12186, %rd8107, %rd8111, %p1477; selp.b64 %rd12187, %rd8109, %rd8112, %p1477; selp.b64 %rd12182, %rd8104, %rd12182, %p1476; selp.b64 %rd12183, %rd8106, %rd12183, %p1476; selp.b64 %rd12184, %rd8108, %rd12184, %p1476; add.s64 %rd8113, %rd12185, 8; selp.b64 %rd12181, %rd8113, %rd12181, %p1476; add.s64 %rd8114, %rd12191, 8; setp.eq.s64 %p1478, %rd12188, %rd12194; selp.b64 %rd8115, %rd8114, %rd12188, %p1478; add.s64 %rd8116, %rd12192, 8; selp.b64 %rd8117, %rd8116, %rd12189, %p1478; add.s64 %rd8118, %rd12193, 8; selp.b64 %rd8119, %rd8118, %rd12190, %p1478; selp.b64 %rd12191, %rd8114, %rd12191, %p1478; selp.b64 %rd12192, %rd8116, %rd12192, %p1478; selp.b64 %rd12193, %rd8118, %rd12193, %p1478; add.s64 %rd8120, %rd12188, 8; selp.b64 %rd12194, %rd8120, %rd12194, %p1478; add.s64 %rd8121, %rd8115, 4; add.s64 %rd8122, %rd8117, 4; add.s64 %rd8123, %rd8119, 4; selp.b64 %rd12188, %rd8115, %rd8121, %p1477; selp.b64 %rd12189, %rd8117, %rd8122, %p1477; selp.b64 %rd12190, %rd8119, %rd8123, %p1477; ld.local.f32 %f3441, [%rd8117]; ld.local.f32 %f3442, [%rd8107]; setp.eq.f32 %p1479, %f3442, %f3441; mov.u64 %rd12185, %rd2508; @%p1479 bra $L__BB1_1053; $L__BB1_1055: mov.u64 %rd11145, 0; or.b64 %rd12229, %rd11145, %rd2491; mov.u32 %r5092, 1; bra.uni $L__BB1_1064; $L__BB1_1056: add.u64 %rd12200, %SP, 552; cvta.to.local.u64 %rd12198, %rd12200; add.u64 %rd12206, %SP, 0; cvta.to.local.u64 %rd12204, %rd12206; div.rn.f32 %f5403, %f744, %f757; fma.rn.f32 %f3443, %f738, %f5403, %f731; mov.b32 %r3179, %f3443; fma.rn.f32 %f3444, %f740, %f5403, %f732; mov.b32 %r3180, %f3444; 
cvt.u64.u32 %rd8125, %r3180; cvt.u64.u32 %rd8126, %r3179; bfi.b64 %rd2532, %rd8125, %rd8126, 32, 32; st.local.u64 [%rd12204], %rd2532; mov.u64 %rd12211, 2; mov.u64 %rd12197, %rd2302; mov.u64 %rd12199, %rd12198; mov.u64 %rd12201, %rd12198; mov.u64 %rd12202, %rd12198; mov.u64 %rd12203, %rd12200; mov.u64 %rd12205, %rd12204; mov.u64 %rd12207, %rd12204; mov.u64 %rd12208, %rd12204; mov.u64 %rd12209, %rd12206; mov.u64 %rd12210, %rd2300; $L__BB1_1057: setp.eq.s64 %p1480, %rd12211, 0; mov.u64 %rd12230, 1; @%p1480 bra $L__BB1_1059; add.s64 %rd12211, %rd12211, -1; add.s64 %rd8131, %rd12198, 8; setp.eq.s64 %p1481, %rd12201, %rd12197; selp.b64 %rd8132, %rd8131, %rd12201, %p1481; add.s64 %rd8133, %rd12199, 8; selp.b64 %rd8134, %rd8133, %rd12202, %p1481; add.s64 %rd8135, %rd12200, 8; selp.b64 %rd8136, %rd8135, %rd12203, %p1481; mov.u64 %rd12230, 0; setp.eq.s64 %p1482, %rd12211, 0; add.s64 %rd8137, %rd8132, 4; add.s64 %rd8138, %rd8134, 4; add.s64 %rd8139, %rd8136, 4; selp.b64 %rd2549, %rd8132, %rd8137, %p1482; selp.b64 %rd12202, %rd8134, %rd8138, %p1482; selp.b64 %rd12203, %rd8136, %rd8139, %p1482; selp.b64 %rd12198, %rd8131, %rd12198, %p1481; selp.b64 %rd12199, %rd8133, %rd12199, %p1481; selp.b64 %rd12200, %rd8135, %rd12200, %p1481; add.s64 %rd8140, %rd12201, 8; selp.b64 %rd12197, %rd8140, %rd12197, %p1481; add.s64 %rd8141, %rd12207, 8; setp.eq.s64 %p1483, %rd12204, %rd12210; selp.b64 %rd8142, %rd8141, %rd12204, %p1483; add.s64 %rd8143, %rd12208, 8; selp.b64 %rd8144, %rd8143, %rd12205, %p1483; add.s64 %rd8145, %rd12209, 8; selp.b64 %rd8146, %rd8145, %rd12206, %p1483; selp.b64 %rd12207, %rd8141, %rd12207, %p1483; selp.b64 %rd12208, %rd8143, %rd12208, %p1483; selp.b64 %rd12209, %rd8145, %rd12209, %p1483; add.s64 %rd8147, %rd12204, 8; selp.b64 %rd12210, %rd8147, %rd12210, %p1483; add.s64 %rd8148, %rd8142, 4; add.s64 %rd8149, %rd8144, 4; add.s64 %rd8150, %rd8146, 4; selp.b64 %rd12204, %rd8142, %rd8148, %p1482; selp.b64 %rd12205, %rd8144, %rd8149, %p1482; selp.b64 %rd12206, %rd8146, 
%rd8150, %p1482; ld.local.f32 %f3445, [%rd8144]; ld.local.f32 %f3446, [%rd8134]; setp.eq.f32 %p1484, %f3446, %f3445; mov.u64 %rd12201, %rd2549; @%p1484 bra $L__BB1_1057; $L__BB1_1059: mov.u64 %rd11146, 0; or.b64 %rd12229, %rd11146, %rd2532; mov.u32 %r5092, 2; bra.uni $L__BB1_1064; $L__BB1_1060: div.rn.f32 %f5403, %f743, %f756; fma.rn.f32 %f3447, %f734, %f5403, %f731; mov.b32 %r3182, %f3447; fma.rn.f32 %f3448, %f736, %f5403, %f732; mov.b32 %r3183, %f3448; cvt.u64.u32 %rd8152, %r3183; cvt.u64.u32 %rd8153, %r3182; bfi.b64 %rd2573, %rd8152, %rd8153, 32, 32; st.local.u64 [%rd7849], %rd2573; mov.u64 %rd12227, 2; mov.u64 %rd12213, %rd2302; mov.u64 %rd12214, %rd7861; mov.u64 %rd12215, %rd7861; mov.u64 %rd12216, %rd7860; mov.u64 %rd12217, %rd7861; mov.u64 %rd12218, %rd7861; mov.u64 %rd12219, %rd7860; mov.u64 %rd12220, %rd7849; mov.u64 %rd12221, %rd7849; mov.u64 %rd12222, %rd7848; mov.u64 %rd12223, %rd7849; mov.u64 %rd12224, %rd7849; mov.u64 %rd12225, %rd7848; mov.u64 %rd12226, %rd2299; $L__BB1_1061: setp.eq.s64 %p1485, %rd12227, 0; mov.u64 %rd12230, 1; @%p1485 bra $L__BB1_1063; add.s64 %rd12227, %rd12227, -1; add.s64 %rd8158, %rd12214, 8; setp.eq.s64 %p1486, %rd12217, %rd12213; selp.b64 %rd8159, %rd8158, %rd12217, %p1486; add.s64 %rd8160, %rd12215, 8; selp.b64 %rd8161, %rd8160, %rd12218, %p1486; add.s64 %rd8162, %rd12216, 8; selp.b64 %rd8163, %rd8162, %rd12219, %p1486; mov.u64 %rd12230, 0; setp.eq.s64 %p1487, %rd12227, 0; add.s64 %rd8164, %rd8159, 4; add.s64 %rd8165, %rd8161, 4; add.s64 %rd8166, %rd8163, 4; selp.b64 %rd2590, %rd8159, %rd8164, %p1487; selp.b64 %rd12218, %rd8161, %rd8165, %p1487; selp.b64 %rd12219, %rd8163, %rd8166, %p1487; selp.b64 %rd12214, %rd8158, %rd12214, %p1486; selp.b64 %rd12215, %rd8160, %rd12215, %p1486; selp.b64 %rd12216, %rd8162, %rd12216, %p1486; add.s64 %rd8167, %rd12217, 8; selp.b64 %rd12213, %rd8167, %rd12213, %p1486; add.s64 %rd8168, %rd12223, 8; setp.eq.s64 %p1488, %rd12220, %rd12226; selp.b64 %rd8169, %rd8168, %rd12220, %p1488; add.s64 
%rd8170, %rd12224, 8; selp.b64 %rd8171, %rd8170, %rd12221, %p1488; add.s64 %rd8172, %rd12225, 8; selp.b64 %rd8173, %rd8172, %rd12222, %p1488; selp.b64 %rd12223, %rd8168, %rd12223, %p1488; selp.b64 %rd12224, %rd8170, %rd12224, %p1488; selp.b64 %rd12225, %rd8172, %rd12225, %p1488; add.s64 %rd8174, %rd12220, 8; selp.b64 %rd12226, %rd8174, %rd12226, %p1488; add.s64 %rd8175, %rd8169, 4; add.s64 %rd8176, %rd8171, 4; add.s64 %rd8177, %rd8173, 4; selp.b64 %rd12220, %rd8169, %rd8175, %p1487; selp.b64 %rd12221, %rd8171, %rd8176, %p1487; selp.b64 %rd12222, %rd8173, %rd8177, %p1487; ld.local.f32 %f3449, [%rd8171]; ld.local.f32 %f3450, [%rd8161]; setp.eq.f32 %p1489, %f3450, %f3449; mov.u64 %rd12217, %rd2590; @%p1489 bra $L__BB1_1061; $L__BB1_1063: mov.u64 %rd11147, 0; or.b64 %rd12229, %rd11147, %rd2573; mov.u32 %r5092, 0; $L__BB1_1064: mov.f32 %f3451, 0f3F800000; sub.f32 %f3452, %f3451, %f5403; mov.b32 %r3186, %f3452; mov.b32 %r3187, %f5403; cvt.u64.u32 %rd8178, %r3187; cvt.u64.u32 %rd8179, %r3186; bfi.b64 %rd12279, %rd8178, %rd8179, 32, 32; mov.b64 {%r3188, %r3189}, %rd12230; mov.b64 {%r3190, %r3191}, %rd12229; cvt.u32.u64 %r5090, %rd12229; mov.b32 %f5404, %r3191; mov.u32 %r5091, 1; mov.b32 {%rs1029, %rs581}, %r3188; bra.uni $L__BB1_1077; $L__BB1_1045: setp.lt.f32 %p1473, %f763, %f764; @%p1473 bra $L__BB1_1047; bra.uni $L__BB1_1046; $L__BB1_1047: mul.f32 %f5401, %f740, %f760; fma.rn.f32 %f5399, %f738, %f760, %f731; mov.u32 %r5092, 2; mov.f32 %f5400, %f732; mov.f32 %f5402, %f760; bra.uni $L__BB1_1051; $L__BB1_1049: mul.f32 %f5401, %f754, %f5402; fma.rn.f32 %f5399, %f753, %f5402, %f733; mov.u32 %r5092, 1; bra.uni $L__BB1_1051; $L__BB1_1046: mul.f32 %f5401, %f754, %f5402; fma.rn.f32 %f5399, %f753, %f5402, %f733; mov.u32 %r5092, 1; $L__BB1_1051: add.f32 %f5404, %f5400, %f5401; mov.f32 %f3435, 0f3F800000; sub.f32 %f3436, %f3435, %f5402; mov.b32 %r3174, %f3436; mov.b32 %r3175, %f5402; cvt.u64.u32 %rd8095, %r3175; cvt.u64.u32 %rd8096, %r3174; bfi.b64 %rd12279, %rd8095, %rd8096, 32, 
32; mov.b32 %r5090, %f5399; mov.u32 %r5091, 1; mov.u16 %rs1029, 1; $L__BB1_1077: mov.b32 %f3459, %r5090; sub.f32 %f3460, %f3459, %f728; sub.f32 %f3461, %f5404, %f729; mul.f32 %f3462, %f3461, %f3461; fma.rn.f32 %f3463, %f3460, %f3460, %f3462; add.f32 %f3464, %f3463, 0f00000000; sqrt.rn.f32 %f3465, %f3464; shl.b64 %rd8263, %rd2472, 2; add.s64 %rd8264, %rd24, %rd8263; st.local.f32 [%rd8264+-4], %f3465; mul.lo.s64 %rd8265, %rd2472, 36; add.s64 %rd8266, %rd7, %rd8265; st.local.u32 [%rd8266+-36], %r5090; st.local.f32 [%rd8266+-32], %f5404; mov.u16 %rs594, 0; st.local.v4.u8 [%rd8266+-28], {%rs1029, %rs594, %rs594, %rs594}; st.local.u32 [%rd8266+-24], %r767; st.local.u32 [%rd8266+-20], %r5091; st.local.u32 [%rd8266+-16], %r5092; shr.u64 %rd8267, %rd12279, 32; st.local.u32 [%rd8266+-8], %rd8267; st.local.u32 [%rd8266+-12], %rd12279; $L__BB1_1078: setp.lt.u64 %p1505, %rd2472, 4; add.s64 %rd2472, %rd2472, 1; @%p1505 bra $L__BB1_1025; ld.local.v2.u64 {%rd12280, %rd12281}, [%rd24]; ld.local.v4.u32 {%r5102, %r5103, %r5104, %r3219}, [%rd7]; ld.local.u32 %r5105, [%rd7+16]; ld.local.u32 %rd8270, [%rd2313+4]; ld.local.u32 %rd8271, [%rd2313+8]; bfi.b64 %rd8272, %rd8271, %rd8270, 32, 32; mov.b64 {%r5099, %r5100}, %rd8272; ld.local.u32 %r5101, [%rd2313+12]; ld.local.u32 %r5106, [%rd2314+4]; ld.local.u32 %r5098, [%rd2315+16]; ld.local.u64 %rd8273, [%rd2315+8]; mov.b64 {%r5096, %r5097}, %rd8273; ld.local.u32 %r5107, [%rd2316+8]; ld.local.u32 %rd8274, [%rd2317+12]; ld.local.u32 %rd8275, [%rd2317+16]; bfi.b64 %rd8276, %rd8275, %rd8274, 32, 32; mov.b64 {%r5093, %r5094}, %rd8276; ld.local.u32 %r5095, [%rd2317+20]; ld.local.u32 %r5108, [%rd2318+12]; bra.uni $L__BB1_1080; $L__BB1_1023: mov.u32 %r5105, 4; mov.u32 %r5106, %r5105; mov.u32 %r5107, %r5105; mov.u32 %r5108, %r5105; $L__BB1_1080: and.b64 %rd8277, %rd2468, 1; setp.eq.b64 %p1506, %rd8277, 1; mov.pred %p1507, 0; xor.pred %p1508, %p1506, %p1507; not.pred %p1509, %p1508; mov.b64 {%r810, %r811}, %rd12280; mov.b32 %f787, %r810; mov.b32 
%f788, %r811; mov.b64 {%r812, %r813}, %rd12281; mov.b32 %f789, %r812; mov.b32 %f790, %r813; @%p1509 bra $L__BB1_1089; bra.uni $L__BB1_1081; $L__BB1_1089: and.b64 %rd8295, %rd2468, 2; setp.eq.s64 %p1523, %rd8295, 0; @%p1523 bra $L__BB1_1098; bra.uni $L__BB1_1090; $L__BB1_1098: and.b64 %rd8313, %rd2468, 4; setp.eq.s64 %p1537, %rd8313, 0; @%p1537 bra $L__BB1_1107; bra.uni $L__BB1_1099; $L__BB1_1107: and.b64 %rd8331, %rd2468, 8; setp.eq.s64 %p1551, %rd8331, 0; @%p1551 bra $L__BB1_1015; ld.u8 %rs601, [%rd2457+88]; and.b16 %rs602, %rs601, 1; setp.eq.b16 %p1552, %rs602, 1; mov.pred %p1553, 0; xor.pred %p1554, %p1552, %p1553; not.pred %p1555, %p1554; @%p1555 bra $L__BB1_1111; bra.uni $L__BB1_1109; $L__BB1_1111: ld.u32 %r861, [%rd2457+76]; cvt.u64.u32 %rd8335, %r861; setp.le.u64 %p1562, %rd2445, %rd8335; @%p1562 bra $L__BB1_1015; neg.f32 %f794, %f790; setp.lt.u32 %p1563, %r766, 64; @%p1563 bra $L__BB1_1114; bra.uni $L__BB1_1113; $L__BB1_1114: mul.wide.u32 %rd8347, %r766, 8; add.s64 %rd8348, %rd2446, %rd8347; mov.u64 %rd12288, 0; st.local.u32 [%rd8348], %r861; st.local.f32 [%rd8348+4], %f794; add.s32 %r766, %r766, 1; st.local.u32 [%rd2446+512], %r766; mov.u64 %rd12289, %rd12288; bra.uni $L__BB1_1115; $L__BB1_1081: ld.u8 %rs595, [%rd2457+88]; and.b16 %rs596, %rs595, 1; setp.eq.b16 %p1510, %rs596, 1; xor.pred %p1512, %p1510, %p1507; not.pred %p1513, %p1512; @%p1513 bra $L__BB1_1084; bra.uni $L__BB1_1082; $L__BB1_1084: ld.u32 %r819, [%rd2457+64]; cvt.u64.u32 %rd8281, %r819; setp.le.u64 %p1520, %rd2445, %rd8281; @%p1520 bra $L__BB1_1089; neg.f32 %f791, %f787; setp.lt.u32 %p1521, %r766, 64; @%p1521 bra $L__BB1_1087; bra.uni $L__BB1_1086; $L__BB1_1087: add.s32 %r3222, %r765, -1; mul.wide.u32 %rd8293, %r3222, 8; add.s64 %rd8294, %rd2446, %rd8293; mov.u64 %rd12282, 0; st.local.u32 [%rd8294], %r819; st.local.f32 [%rd8294+4], %f791; add.s32 %r766, %r766, 1; st.local.u32 [%rd2446+512], %r766; mov.u64 %rd12283, %rd12282; bra.uni $L__BB1_1088; $L__BB1_1090: ld.u8 %rs597, [%rd2457+88]; 
and.b16 %rs598, %rs597, 1; setp.eq.b16 %p1524, %rs598, 1; mov.pred %p1525, 0; xor.pred %p1526, %p1524, %p1525; not.pred %p1527, %p1526; @%p1527 bra $L__BB1_1093; bra.uni $L__BB1_1091; $L__BB1_1093: ld.u32 %r833, [%rd2457+68]; cvt.u64.u32 %rd8299, %r833; setp.le.u64 %p1534, %rd2445, %rd8299; @%p1534 bra $L__BB1_1098; neg.f32 %f792, %f788; setp.lt.u32 %p1535, %r766, 64; @%p1535 bra $L__BB1_1096; bra.uni $L__BB1_1095; $L__BB1_1096: mul.wide.u32 %rd8311, %r766, 8; add.s64 %rd8312, %rd2446, %rd8311; mov.u64 %rd12284, 0; st.local.u32 [%rd8312], %r833; st.local.f32 [%rd8312+4], %f792; add.s32 %r766, %r766, 1; st.local.u32 [%rd2446+512], %r766; mov.u64 %rd12285, %rd12284; bra.uni $L__BB1_1097; $L__BB1_1099: ld.u8 %rs599, [%rd2457+88]; and.b16 %rs600, %rs599, 1; setp.eq.b16 %p1538, %rs600, 1; mov.pred %p1539, 0; xor.pred %p1540, %p1538, %p1539; not.pred %p1541, %p1540; @%p1541 bra $L__BB1_1102; bra.uni $L__BB1_1100; $L__BB1_1102: ld.u32 %r847, [%rd2457+72]; cvt.u64.u32 %rd8317, %r847; setp.le.u64 %p1548, %rd2445, %rd8317; @%p1548 bra $L__BB1_1107; neg.f32 %f793, %f789; setp.lt.u32 %p1549, %r766, 64; @%p1549 bra $L__BB1_1105; bra.uni $L__BB1_1104; $L__BB1_1105: mul.wide.u32 %rd8329, %r766, 8; add.s64 %rd8330, %rd2446, %rd8329; mov.u64 %rd12286, 0; st.local.u32 [%rd8330], %r847; st.local.f32 [%rd8330+4], %f793; add.s32 %r766, %r766, 1; st.local.u32 [%rd2446+512], %r766; mov.u64 %rd12287, %rd12286; bra.uni $L__BB1_1106; $L__BB1_1082: setp.leu.f32 %p1514, %f730, %f787; setp.eq.s32 %p1515, %r5105, 4; or.pred %p1516, %p1515, %p1514; @%p1516 bra $L__BB1_1089; ld.u32 %r3220, [%rd2457+64]; cvt.u64.u32 %rd8278, %r3220; setp.le.u64 %p1517, %rd2448, %rd8278; mul.wide.u32 %rd8279, %r3220, 12; add.s64 %rd8280, %rd2449, %rd8279; setp.eq.s64 %p1518, %rd8280, 0; or.pred %p1519, %p1517, %p1518; selp.b32 %r761, %r761, %r5104, %p1519; selp.b32 %r760, %r760, %r5103, %p1519; selp.b32 %r759, %r759, %r5102, %p1519; selp.b32 %r763, %r763, %r5105, %p1519; selp.b32 %r764, %r764, %r810, %p1519; 
bra.uni $L__BB1_1089; $L__BB1_1091: mov.b32 %f3466, %r764; setp.leu.f32 %p1528, %f3466, %f788; setp.eq.s32 %p1529, %r5106, 4; or.pred %p1530, %p1529, %p1528; @%p1530 bra $L__BB1_1098; ld.u32 %r3228, [%rd2457+68]; cvt.u64.u32 %rd8296, %r3228; setp.le.u64 %p1531, %rd2448, %rd8296; mul.wide.u32 %rd8297, %r3228, 12; add.s64 %rd8298, %rd2449, %rd8297; setp.eq.s64 %p1532, %rd8298, 0; or.pred %p1533, %p1531, %p1532; selp.b32 %r761, %r761, %r5101, %p1533; selp.b32 %r760, %r760, %r5100, %p1533; selp.b32 %r759, %r759, %r5099, %p1533; selp.b32 %r763, %r763, %r5106, %p1533; selp.b32 %r764, %r764, %r811, %p1533; bra.uni $L__BB1_1098; $L__BB1_1100: mov.b32 %f3467, %r764; setp.leu.f32 %p1542, %f3467, %f789; setp.eq.s32 %p1543, %r5107, 4; or.pred %p1544, %p1543, %p1542; @%p1544 bra $L__BB1_1107; ld.u32 %r3235, [%rd2457+72]; cvt.u64.u32 %rd8314, %r3235; setp.le.u64 %p1545, %rd2448, %rd8314; mul.wide.u32 %rd8315, %r3235, 12; add.s64 %rd8316, %rd2449, %rd8315; setp.eq.s64 %p1546, %rd8316, 0; or.pred %p1547, %p1545, %p1546; selp.b32 %r761, %r761, %r5098, %p1547; selp.b32 %r760, %r760, %r5097, %p1547; selp.b32 %r759, %r759, %r5096, %p1547; selp.b32 %r763, %r763, %r5107, %p1547; selp.b32 %r764, %r764, %r812, %p1547; bra.uni $L__BB1_1107; $L__BB1_1109: mov.b32 %f3468, %r764; setp.leu.f32 %p1556, %f3468, %f790; setp.eq.s32 %p1557, %r5108, 4; or.pred %p1558, %p1557, %p1556; @%p1558 bra $L__BB1_1015; bra.uni $L__BB1_1110; $L__BB1_1113: mov.u64 %rd12289, 1; shl.b64 %rd12288, %rd8335, 32; $L__BB1_1115: mov.u64 %rd11160, 0; cvt.u32.u64 %r3244, %rd11160; cvt.u32.u64 %r3245, %rd12288; or.b32 %r3246, %r3245, %r3244; cvt.u32.u64 %r3247, %rd12289; or.b32 %r3248, %r3246, %r3247; setp.eq.s32 %p1564, %r3248, 0; @%p1564 bra $L__BB1_1015; bra.uni $L__BB1_1116; $L__BB1_1086: mov.u64 %rd12283, 1; shl.b64 %rd12282, %rd8281, 32; $L__BB1_1088: mov.u64 %rd11151, 0; cvt.u32.u64 %r3223, %rd11151; cvt.u32.u64 %r3224, %rd12282; or.b32 %r3225, %r3224, %r3223; cvt.u32.u64 %r3226, %rd12283; or.b32 %r3227, %r3225, 
%r3226; setp.ne.s32 %p1522, %r3227, 0; @%p1522 bra $L__BB1_1116; bra.uni $L__BB1_1089; $L__BB1_1095: mov.u64 %rd12285, 1; shl.b64 %rd12284, %rd8299, 32; $L__BB1_1097: mov.u64 %rd11154, 0; cvt.u32.u64 %r3230, %rd11154; cvt.u32.u64 %r3231, %rd12284; or.b32 %r3232, %r3231, %r3230; cvt.u32.u64 %r3233, %rd12285; or.b32 %r3234, %r3232, %r3233; setp.ne.s32 %p1536, %r3234, 0; @%p1536 bra $L__BB1_1116; bra.uni $L__BB1_1098; $L__BB1_1104: mov.u64 %rd12287, 1; shl.b64 %rd12286, %rd8317, 32; $L__BB1_1106: mov.u64 %rd11157, 0; cvt.u32.u64 %r3237, %rd11157; cvt.u32.u64 %r3238, %rd12286; or.b32 %r3239, %r3238, %r3237; cvt.u32.u64 %r3240, %rd12287; or.b32 %r3241, %r3239, %r3240; setp.ne.s32 %p1550, %r3241, 0; @%p1550 bra $L__BB1_1116; bra.uni $L__BB1_1107; $L__BB1_1117: setp.eq.s32 %p1565, %r763, 4; mov.u64 %rd12290, %rd8003; mov.u64 %rd12291, %rd8001; mov.u64 %rd12292, %rd8003; @%p1565 bra $L__BB1_1119; mov.b64 %rd12292, {%r759, %r760}; mov.b32 {%rs603, %rs604}, %r761; mov.b64 %rd8355, {%r761, %r3249}; and.b64 %rd12290, %rd8355, 4294967040; cvt.u64.u16 %rd8356, %rs603; and.b64 %rd12291, %rd8356, 255; $L__BB1_1119: or.b64 %rd8363, %rd12291, %rd12290; or.b64 %rd8364, %rd8363, %rd8003; mov.b64 {%r3250, %r3251}, %rd8364; mov.b32 {%rs110, %rs605}, %r3250; and.b16 %rs606, %rs110, 255; setp.eq.s16 %p1566, %rs606, 2; @%p1566 bra $L__BB1_1121; cvt.u32.u64 %r3252, %rd12292; mov.b32 %f3469, %r3252; shr.u64 %rd8365, %rd12292, 32; cvt.u32.u64 %r3253, %rd8365; mov.b32 %f3470, %r3253; ld.global.f32 %f3471, [%rd2320+248]; mul.f32 %f3472, %f3471, %f3469; ld.global.f32 %f3473, [%rd2320+252]; mul.f32 %f3474, %f3473, %f3470; sub.f32 %f3475, %f3472, %f3474; mul.f32 %f3476, %f3473, %f3469; fma.rn.f32 %f3477, %f3471, %f3470, %f3476; ld.global.f32 %f3478, [%rd2320+256]; add.f32 %f3479, %f3478, %f3475; mov.b32 %r3254, %f3479; ld.global.f32 %f3480, [%rd2320+260]; add.f32 %f3481, %f3480, %f3477; mov.b32 %r3255, %f3481; cvt.u64.u32 %rd8366, %r3255; cvt.u64.u32 %rd8367, %r3254; cvt.u64.u16 %rd8368, %rs110; 
bfi.b64 %rd8003, %rd8366, %rd8367, 32, 32; and.b64 %rd8369, %rd8368, 255; mov.b64 {%r3256, %r3257}, %rd8369; mov.b32 {%rs607, %rs608}, %r3256; cvt.u64.u16 %rd8001, %rs607; $L__BB1_1121: mov.u64 %rd11169, 0; or.b64 %rd8376, %rd11169, %rd8001; or.b64 %rd2796, %rd8376, %rd11169; mov.b64 {%r3258, %r3259}, %rd2796; mov.b32 {%rs111, %rs609}, %r3258; and.b16 %rs610, %rs111, 255; setp.eq.s16 %p1567, %rs610, 2; mov.u64 %rd12295, 2; mov.u64 %rd12296, %rd11169; mov.u64 %rd12297, %rd11169; @%p1567 bra $L__BB1_1123; and.b64 %rd8378, %rd2796, 4294967040; cvt.u64.u16 %rd8379, %rs111; and.b64 %rd8380, %rd8379, 255; or.b64 %rd8381, %rd8380, %rd11169; or.b64 %rd8382, %rd8381, %rd8378; mov.b64 {%r3260, %r3261}, %rd8382; mov.b32 {%rs611, %rs612}, %r3260; not.b16 %rs613, %rs611; ld.global.u8 %rs614, [%rd2320+240]; setp.eq.s16 %p1568, %rs614, 0; and.b16 %rs615, %rs613, 1; selp.b16 %rs616, %rs611, %rs615, %p1568; and.b64 %rd8383, %rd8382, 4294967040; cvt.u64.u16 %rd8384, %rs616; and.b64 %rd8385, %rd8384, 255; or.b64 %rd8386, %rd8383, %rd11169; or.b64 %rd8387, %rd8386, %rd8385; mov.b64 {%r3262, %r3263}, %rd8387; mov.b32 {%rs617, %rs618}, %r3262; and.b64 %rd12297, %rd8387, 4294967040; cvt.u64.u16 %rd8388, %rs617; and.b64 %rd12295, %rd8388, 255; mov.u64 %rd12296, %rd8003; $L__BB1_1123: or.b64 %rd8389, %rd12296, %rd11169; or.b64 %rd8390, %rd11169, %rd12295; or.b64 %rd8391, %rd8390, %rd12297; or.b64 %rd8392, %rd8389, %rd11169; mov.b64 {%r5139, %r5140}, %rd8392; mov.b64 {%r5141, %r3264}, %rd8391; bra.uni $L__BB1_1180; $L__BB1_983: cvt.u32.u64 %r3057, %rd2325; cvt.u32.u64 %r3058, %rd2344; rem.u32 %r3059, %r3058, %r3057; cvt.u64.u32 %rd12137, %r3059; $L__BB1_984: shl.b64 %rd7906, %rd12137, 3; add.s64 %rd2348, %rd2326, %rd7906; ld.u32 %rd7907, [%rd2348]; ld.u32 %rd7908, [%rd2348+4]; bfi.b64 %rd2349, %rd7908, %rd7907, 32, 32; add.s64 %rd2350, %rd12137, 1; or.b64 %rd7909, %rd2350, %rd2325; and.b64 %rd7910, %rd7909, -4294967296; setp.eq.s64 %p1385, %rd7910, 0; @%p1385 bra $L__BB1_986; rem.u64 
%rd12138, %rd2350, %rd2325; bra.uni $L__BB1_987; $L__BB1_986: cvt.u32.u64 %r3060, %rd2325; cvt.u32.u64 %r3061, %rd2350; rem.u32 %r3062, %r3061, %r3060; cvt.u64.u32 %rd12138, %r3062; $L__BB1_987: add.u64 %rd12148, %SP, 560; cvta.to.local.u64 %rd12146, %rd12148; shl.b64 %rd7912, %rd12138, 3; add.s64 %rd2360, %rd2326, %rd7912; ld.u32 %rd7913, [%rd2360]; ld.u32 %rd7914, [%rd2360+4]; bfi.b64 %rd7915, %rd7914, %rd7913, 32, 32; st.local.v2.u64 [%rd12146], {%rd2349, %rd7915}; mov.u64 %rd12153, 2; mov.u64 %rd12139, %rd2308; mov.u64 %rd12140, %rd2306; mov.u64 %rd12141, %rd2306; mov.u64 %rd12142, %rd2307; mov.u64 %rd12143, %rd2306; mov.u64 %rd12144, %rd2306; mov.u64 %rd12145, %rd2307; mov.u64 %rd12147, %rd12146; mov.u64 %rd12149, %rd12146; mov.u64 %rd12150, %rd12146; mov.u64 %rd12151, %rd12148; mov.u64 %rd12152, %rd2309; $L__BB1_988: setp.eq.s64 %p1386, %rd12153, 0; @%p1386 bra $L__BB1_991; add.s64 %rd12153, %rd12153, -1; add.s64 %rd7916, %rd12140, 8; setp.eq.s64 %p1387, %rd12143, %rd12139; selp.b64 %rd7917, %rd7916, %rd12143, %p1387; add.s64 %rd7918, %rd12141, 8; selp.b64 %rd7919, %rd7918, %rd12144, %p1387; add.s64 %rd7920, %rd12142, 8; selp.b64 %rd7921, %rd7920, %rd12145, %p1387; setp.eq.s64 %p1388, %rd12153, 0; add.s64 %rd7922, %rd7917, 4; add.s64 %rd7923, %rd7919, 4; add.s64 %rd7924, %rd7921, 4; selp.b64 %rd2377, %rd7917, %rd7922, %p1388; selp.b64 %rd12144, %rd7919, %rd7923, %p1388; selp.b64 %rd12145, %rd7921, %rd7924, %p1388; selp.b64 %rd12140, %rd7916, %rd12140, %p1387; selp.b64 %rd12141, %rd7918, %rd12141, %p1387; selp.b64 %rd12142, %rd7920, %rd12142, %p1387; add.s64 %rd7925, %rd12143, 8; selp.b64 %rd12139, %rd7925, %rd12139, %p1387; add.s64 %rd7926, %rd12149, 8; setp.eq.s64 %p1389, %rd12146, %rd12152; selp.b64 %rd7927, %rd7926, %rd12146, %p1389; add.s64 %rd7928, %rd12150, 8; selp.b64 %rd7929, %rd7928, %rd12147, %p1389; add.s64 %rd7930, %rd12151, 8; selp.b64 %rd7931, %rd7930, %rd12148, %p1389; selp.b64 %rd12149, %rd7926, %rd12149, %p1389; selp.b64 %rd12150, %rd7928, 
%rd12150, %p1389; selp.b64 %rd12151, %rd7930, %rd12151, %p1389; add.s64 %rd7932, %rd12146, 8; selp.b64 %rd12152, %rd7932, %rd12152, %p1389; add.s64 %rd7933, %rd7927, 4; add.s64 %rd7934, %rd7929, 4; add.s64 %rd7935, %rd7931, 4; selp.b64 %rd12146, %rd7927, %rd7933, %p1388; selp.b64 %rd12147, %rd7929, %rd7934, %p1388; selp.b64 %rd12148, %rd7931, %rd7935, %p1388; ld.local.f32 %f3235, [%rd7929]; ld.local.f32 %f3236, [%rd7919]; setp.eq.f32 %p1390, %f3236, %f3235; mov.u64 %rd12143, %rd2377; @%p1390 bra $L__BB1_988; bra.uni $L__BB1_990; $L__BB1_991: ld.u32 %rd7936, [%rd2348]; ld.u32 %rd7937, [%rd2348+4]; bfi.b64 %rd7938, %rd7937, %rd7936, 32, 32; cvt.u32.u64 %r3063, %rd7938; mov.b32 %f3237, %r3063; shr.u64 %rd7939, %rd7938, 32; cvt.u32.u64 %r3064, %rd7939; mov.b32 %f3238, %r3064; ld.u32 %rd7940, [%rd2360]; ld.u32 %rd7941, [%rd2360+4]; bfi.b64 %rd7942, %rd7941, %rd7940, 32, 32; cvt.u32.u64 %r3065, %rd7942; shr.u64 %rd7943, %rd7942, 32; cvt.u32.u64 %r3066, %rd7943; mov.b32 %f3239, %r3065; sub.f32 %f5397, %f3239, %f3237; mov.b32 %f3240, %r3066; sub.f32 %f5398, %f3240, %f3238; bra.uni $L__BB1_1002; $L__BB1_996: cvt.u32.u64 %r3067, %rd2325; cvt.u32.u64 %r3068, %rd2391; rem.u32 %r3069, %r3068, %r3067; cvt.u64.u32 %rd12154, %r3069; $L__BB1_997: shl.b64 %rd7952, %rd12154, 3; add.s64 %rd7953, %rd2326, %rd7952; ld.u32 %rd7954, [%rd7953]; ld.u32 %rd7955, [%rd7953+4]; bfi.b64 %rd2402, %rd7955, %rd7954, 32, 32; add.u64 %rd7957, %SPL, 560; st.local.v2.u64 [%rd7957], {%rd2392, %rd2402}; mov.u64 %rd12169, 2; mov.u64 %rd12155, %rd2306; mov.u64 %rd12156, %rd2303; mov.u64 %rd12157, %rd2303; mov.u64 %rd12158, %rd2305; mov.u64 %rd12159, %rd2303; mov.u64 %rd12160, %rd2303; mov.u64 %rd12161, %rd2305; mov.u64 %rd12162, %rd2310; mov.u64 %rd12163, %rd2310; mov.u64 %rd12164, %rd2311; mov.u64 %rd12165, %rd2310; mov.u64 %rd12166, %rd2310; mov.u64 %rd12167, %rd2311; mov.u64 %rd12168, %rd2312; $L__BB1_998: setp.eq.s64 %p1394, %rd12169, 0; @%p1394 bra $L__BB1_1001; add.s64 %rd12169, %rd12169, -1; add.s64 
%rd7958, %rd12156, 8; setp.eq.s64 %p1395, %rd12159, %rd12155; selp.b64 %rd7959, %rd7958, %rd12159, %p1395; add.s64 %rd7960, %rd12157, 8; selp.b64 %rd7961, %rd7960, %rd12160, %p1395; add.s64 %rd7962, %rd12158, 8; selp.b64 %rd7963, %rd7962, %rd12161, %p1395; setp.eq.s64 %p1396, %rd12169, 0; add.s64 %rd7964, %rd7959, 4; add.s64 %rd7965, %rd7961, 4; add.s64 %rd7966, %rd7963, 4; selp.b64 %rd2419, %rd7959, %rd7964, %p1396; selp.b64 %rd12160, %rd7961, %rd7965, %p1396; selp.b64 %rd12161, %rd7963, %rd7966, %p1396; selp.b64 %rd12156, %rd7958, %rd12156, %p1395; selp.b64 %rd12157, %rd7960, %rd12157, %p1395; selp.b64 %rd12158, %rd7962, %rd12158, %p1395; add.s64 %rd7967, %rd12159, 8; selp.b64 %rd12155, %rd7967, %rd12155, %p1395; add.s64 %rd7968, %rd12165, 8; setp.eq.s64 %p1397, %rd12162, %rd12168; selp.b64 %rd7969, %rd7968, %rd12162, %p1397; add.s64 %rd7970, %rd12166, 8; selp.b64 %rd7971, %rd7970, %rd12163, %p1397; add.s64 %rd7972, %rd12167, 8; selp.b64 %rd7973, %rd7972, %rd12164, %p1397; selp.b64 %rd12165, %rd7968, %rd12165, %p1397; selp.b64 %rd12166, %rd7970, %rd12166, %p1397; selp.b64 %rd12167, %rd7972, %rd12167, %p1397; add.s64 %rd7974, %rd12162, 8; selp.b64 %rd12168, %rd7974, %rd12168, %p1397; add.s64 %rd7975, %rd7969, 4; add.s64 %rd7976, %rd7971, 4; add.s64 %rd7977, %rd7973, 4; selp.b64 %rd12162, %rd7969, %rd7975, %p1396; selp.b64 %rd12163, %rd7971, %rd7976, %p1396; selp.b64 %rd12164, %rd7973, %rd7977, %p1396; ld.local.f32 %f3241, [%rd7971]; ld.local.f32 %f3242, [%rd7961]; setp.eq.f32 %p1398, %f3242, %f3241; mov.u64 %rd12159, %rd2419; @%p1398 bra $L__BB1_998; bra.uni $L__BB1_1000; $L__BB1_1001: cvt.u32.u64 %r3070, %rd2392; mov.b32 %f3243, %r3070; shr.u64 %rd7978, %rd2392, 32; cvt.u32.u64 %r3071, %rd7978; mov.b32 %f3244, %r3071; shr.u64 %rd7979, %rd2402, 32; cvt.u32.u64 %r3072, %rd7979; cvt.u32.u64 %r3073, %rd2402; mov.b32 %f3245, %r3073; sub.f32 %f3246, %f3245, %f3243; mov.b32 %f3247, %r3072; sub.f32 %f3248, %f3247, %f3244; neg.f32 %f5397, %f3246; neg.f32 %f5398, %f3248; 
$L__BB1_1002: mul.f32 %f3249, %f720, %f5398; fma.rn.f32 %f727, %f719, %f5397, %f3249; mul.f32 %f3250, %f5398, %f5398; fma.rn.f32 %f3251, %f5397, %f5397, %f3250; add.f32 %f3252, %f3251, 0f00000000; sqrt.rn.f32 %f3253, %f3252; mul.f32 %f3254, %f3253, 0f3A83126F; abs.f32 %f3255, %f727; setp.gt.f32 %p1399, %f3255, %f3254; @%p1399 bra $L__BB1_1004; bra.uni $L__BB1_1003; $L__BB1_1004: setp.ge.f32 %p2932, %f727, 0f00000000; bra.uni $L__BB1_1007; $L__BB1_1003: ld.local.u64 %rd7980, [%rd2324+8]; cvt.u32.u64 %r3074, %rd7980; mov.b32 %f3256, %r3074; shr.u64 %rd7981, %rd7980, 32; cvt.u32.u64 %r3075, %rd7981; mov.b32 %f3257, %r3075; sub.f32 %f3258, %f518, %f3256; sub.f32 %f3259, %f519, %f3257; mul.f32 %f3260, %f720, %f3259; fma.rn.f32 %f3261, %f719, %f3258, %f3260; setp.le.f32 %p2932, %f3261, 0f00000000; $L__BB1_1007: selp.u16 %rs554, 1, 0, %p2932; st.local.u8 [%rd2324+16], %rs554; $L__BB1_1008: ld.local.v2.u32 {%r5077, %r5078}, [%rd2324+8]; ld.local.u32 %r5079, [%rd2324+16]; $L__BB1_1010: setp.eq.s32 %p1400, %r744, 2; mov.u64 %rd7989, 0; mov.u64 %rd12170, 2; mov.u64 %rd12171, %rd7989; @%p1400 bra $L__BB1_1012; setp.ne.s16 %p1401, %rs95, 0; cvt.u16.u32 %rs556, %r5079; selp.u16 %rs557, 1, 0, %p1401; xor.b16 %rs558, %rs556, %rs557; mov.b32 %f3268, %r5077; mov.b32 %f3269, %r5078; mul.f32 %f3270, %f690, %f3268; ld.global.f32 %f3271, [%rd2320+252]; mul.f32 %f3272, %f3271, %f3269; sub.f32 %f3273, %f3270, %f3272; mul.f32 %f3274, %f3271, %f3268; fma.rn.f32 %f3275, %f690, %f3269, %f3274; add.f32 %f3276, %f688, %f3273; mov.b32 %r3080, %f3276; add.f32 %f3277, %f689, %f3275; mov.b32 %r3081, %f3277; cvt.u64.u32 %rd7990, %r3081; cvt.u64.u32 %rd7991, %r3080; cvt.u64.u16 %rd7992, %rs558; bfi.b64 %rd12171, %rd7990, %rd7991, 32, 32; and.b64 %rd7993, %rd7992, 255; mov.b64 {%r3082, %r3083}, %rd7993; mov.b32 {%rs559, %rs560}, %r3082; cvt.u64.u16 %rd12170, %rs559; $L__BB1_1012: or.b64 %rd7994, %rd7989, %rd7989; or.b64 %rd7995, %rd12170, %rd7989; or.b64 %rd7996, %rd7995, %rd7989; or.b64 %rd7997, 
%rd7994, %rd12171; mov.b64 {%r5139, %r5140}, %rd7997; mov.b64 {%r5141, %r3084}, %rd7996; $L__BB1_1180: mov.b32 {%rs116, %rs628}, %r5141; and.b16 %rs629, %rs116, 255; setp.eq.s16 %p1646, %rs629, 2; @%p1646 bra $L__BB1_1182; mov.b64 %rd8472, {%r5141, %r3332}; shr.u64 %rd8473, %rd8472, 8; and.b64 %rd8474, %rd8473, 16777215; cvt.u64.u16 %rd8475, %rs116; and.b64 %rd8476, %rd8475, 255; mov.b64 %rd8477, {%r5139, %r5140}; bfi.b64 %rd8478, %rd8474, %rd8476, 8, 56; mov.b64 {%r3333, %r3334}, %rd8478; mov.b32 {%rs630, %rs631}, %r3333; shr.u64 %rd8479, %rd8477, 32; cvt.u32.u64 %r3335, %rd8479; mov.b32 %f3582, %r5139; sub.f32 %f3583, %f3582, %f518; mov.b32 %f3584, %r3335; sub.f32 %f3585, %f3584, %f519; mul.f32 %f3586, %f3585, %f3585; fma.rn.f32 %f3587, %f3583, %f3583, %f3586; add.f32 %f3588, %f3587, 0f00000000; sqrt.rn.f32 %f3589, %f3588; and.b16 %rs632, %rs630, 1; setp.eq.b16 %p1647, %rs632, 1; selp.f32 %f3590, 0fBF800000, 0f3F800000, %p1647; mul.f32 %f3591, %f3590, %f3589; setp.ge.f32 %p1648, %f3591, %f687; setp.le.f32 %p1649, %f3591, %f687; selp.b16 %rs633, 1, 2, %p1649; setp.gtu.f32 %p1650, %f3591, %f687; selp.b16 %rs634, -1, 0, %p1650; selp.b16 %rs635, %rs634, %rs633, %p1648; setp.eq.s16 %p1651, %rs635, 1; selp.f32 %f687, %f3591, %f687, %p1651; $L__BB1_1182: add.s64 %rd2321, %rd2321, 280; setp.ne.s64 %p1652, %rd1745, 0; add.s64 %rd2320, %rd2320, 280; @%p1652 bra $L__BB1_952; $L__BB1_1183: setp.eq.s32 %p1653, %r537, 0; @%p1653 bra $L__BB1_1185; ld.param.f32 %f5236, [grid_update_param_1]; sub.f32 %f3593, %f517, %f687; div.rn.f32 %f3594, %f3593, %f5236; div.rn.f32 %f3595, %f3594, 0f3DCCCCCD; mul.f32 %f858, %f3595, 0f3F000000; $L__BB1_1185: cvta.to.global.u64 %rd3469, %rd5233; add.f32 %f859, %f178, %f2; add.f32 %f860, %f177, %f3; mov.u32 %r3338, 2; mov.u64 %rd8482, 0; mov.u64 %rd3471, %rd5233; mov.u64 %rd2883, %rd8482; mov.u64 %rd8507, %rd8482; @%p423 bra $L__BB1_1422; ld.param.u64 %rd2883, [grid_update_param_3]; add.u64 %rd8483, %SP, 560; add.u64 %rd8484, %SPL, 560; add.s64 
%rd2855, %rd8484, 8; add.u64 %rd8487, %SP, 0; add.u64 %rd8488, %SPL, 0; add.s64 %rd2856, %rd8488, 8; add.s64 %rd2857, %rd8488, 8; add.s64 %rd2858, %rd8488, 8; add.s64 %rd2859, %rd8488, 8; add.s64 %rd2860, %rd8488, 8; add.s64 %rd2861, %rd8488, 8; add.u64 %rd8499, %SP, 552; add.u64 %rd8500, %SPL, 552; add.s64 %rd2862, %rd8500, 8; add.u64 %rd8501, %SP, 32; add.u64 %rd8502, %SPL, 32; add.s64 %rd2863, %rd8502, 36; add.s64 %rd2864, %rd8502, 4; add.s64 %rd2865, %rd8501, 36; add.s64 %rd2866, %rd8502, 44; add.s64 %rd2867, %rd8501, 44; add.s64 %rd2868, %rd8502, 52; add.s64 %rd2869, %rd8484, 8; add.s64 %rd2870, %rd8484, 8; or.b64 %rd2871, %rd8483, 8; add.s64 %rd2872, %rd8484, 16; add.s64 %rd2873, %rd7, 32; add.s64 %rd2874, %rd7, 48; add.s64 %rd2875, %rd7, 64; add.s64 %rd2876, %rd7, 80; add.s64 %rd2877, %rd7, 96; add.s64 %rd2878, %rd7, 112; cvta.to.global.u64 %rd12312, %rd5233; mov.u64 %rd12313, %rd5233; $L__BB1_1187: mov.u64 %rd2882, %rd12313; mov.u64 %rd2881, %rd12312; add.s64 %rd2883, %rd2883, -1; setp.eq.s64 %p1655, %rd2882, 0; @%p1655 bra $L__BB1_1421; add.s64 %rd2884, %rd2881, 272; ld.global.u32 %r3342, [%rd2881+272]; mov.u64 %rd8507, 0; setp.eq.s32 %p1656, %r3342, 3; mov.u32 %r3341, 2; @%p1656 bra $L__BB1_1418; ld.global.u16 %rs636, [%rd2884+-272]; setp.eq.s16 %p1657, %rs636, 1; @%p1657 bra $L__BB1_1360; setp.eq.s16 %p1658, %rs636, 2; @%p1658 bra $L__BB1_1249; setp.ne.s16 %p1659, %rs636, 3; @%p1659 bra $L__BB1_1398; ld.global.u8 %rs117, [%rd2884+-248]; ld.global.f32 %f861, [%rd2884+-16]; sub.f32 %f3596, %f859, %f861; ld.global.f32 %f862, [%rd2884+-12]; sub.f32 %f3597, %f860, %f862; ld.global.f32 %f3598, [%rd2884+-20]; ld.global.f32 %f863, [%rd2884+-24]; mul.f32 %f3599, %f3597, %f3598; fma.rn.f32 %f864, %f3596, %f863, %f3599; mul.f32 %f3600, %f3596, %f3598; mul.f32 %f3601, %f3597, %f863; sub.f32 %f865, %f3601, %f3600; cvta.to.local.u64 %rd2885, %rd8501; mov.u32 %r918, 2; st.local.u32 [%rd2885+20], %r918; ld.global.u64 %rd2886, [%rd2884+-256]; setp.eq.s64 %p1660, %rd2886, 
0; @%p1660 bra $L__BB1_1246; mov.b32 %r3357, %f865; ld.global.u64 %rd2887, [%rd2884+-264]; mov.b32 %r3358, %f864; and.b32 %r3359, %r3358, 2147483647; mov.b32 %f866, %r3359; and.b32 %r3360, %r3357, 2147483647; mov.b32 %f867, %r3360; mov.u64 %rd12315, 1; bra.uni $L__BB1_1194; $L__BB1_1202: sub.f32 %f3613, %f5416, %f864; abs.f32 %f882, %f3613; setp.le.f32 %p1670, %f882, 0f34000000; @%p1670 bra $L__BB1_1204; abs.f32 %f3614, %f5416; abs.f32 %f3615, %f864; setp.gt.f32 %p1672, %f3615, %f3614; selp.f32 %f3616, %f3615, %f3614, %p1672; mul.f32 %f3617, %f3616, 0f34000000; setp.gtu.f32 %p1673, %f882, %f3617; @%p1673 bra $L__BB1_1208; bra.uni $L__BB1_1204; $L__BB1_1194: shl.b64 %rd8512, %rd12315, 3; add.s64 %rd8513, %rd2887, %rd8512; setp.eq.s64 %p1661, %rd12315, %rd2886; selp.b64 %rd8514, 0, %rd12315, %p1661; shl.b64 %rd8515, %rd8514, 3; add.s64 %rd8516, %rd2887, %rd8515; ld.u32 %rd8517, [%rd8513+-8]; ld.u32 %rd8518, [%rd8513+-4]; bfi.b64 %rd2890, %rd8518, %rd8517, 32, 32; ld.u32 %rd8519, [%rd8516]; ld.u32 %rd8520, [%rd8516+4]; bfi.b64 %rd2891, %rd8520, %rd8519, 32, 32; cvt.u32.u64 %r5143, %rd2890; mov.b32 %f5416, %r5143; shr.u64 %rd8521, %rd2890, 32; cvt.u32.u64 %r3363, %rd8521; mov.b32 %f870, %r3363; cvt.u32.u64 %r902, %rd2891; shr.u64 %rd8522, %rd2891, 32; cvt.u32.u64 %r3364, %rd8522; mov.b32 %f871, %r902; sub.f32 %f872, %f871, %f5416; mov.b32 %f3603, %r3364; sub.f32 %f873, %f3603, %f870; sub.f32 %f3604, %f864, %f5416; sub.f32 %f3605, %f865, %f870; mul.f32 %f3606, %f873, %f3605; fma.rn.f32 %f874, %f872, %f3604, %f3606; mul.f32 %f3607, %f873, %f873; fma.rn.f32 %f3608, %f872, %f872, %f3607; add.f32 %f875, %f3608, 0f00000000; setp.gtu.f32 %p1662, %f874, 0f00000000; mov.b64 {%r3365, %r5144}, %rd2890; mov.b64 {%r3366, %r904}, %rd2891; @%p1662 bra $L__BB1_1196; bra.uni $L__BB1_1195; $L__BB1_1196: setp.ltu.f32 %p1663, %f874, %f875; @%p1663 bra $L__BB1_1198; bra.uni $L__BB1_1197; $L__BB1_1198: setp.eq.f32 %p1664, %f875, 0f00000000; @%p1664 bra $L__BB1_1245; div.rn.f32 %f3609, 
%f874, %f875; mov.f32 %f3610, 0f3F800000; sub.f32 %f3611, %f3610, %f3609; mov.b32 %r5146, %f3611; mov.b32 %r5147, %f3609; fma.rn.f32 %f5416, %f872, %f3609, %f5416; mov.b32 %r5143, %f5416; fma.rn.f32 %f5417, %f873, %f3609, %f870; mov.b32 %r5144, %f5417; mov.u32 %r5145, 1; bra.uni $L__BB1_1200; $L__BB1_1195: mov.b32 %f5417, %r5144; mov.u32 %r5145, 0; mov.u32 %r5146, %r5145; bra.uni $L__BB1_1200; $L__BB1_1197: mov.b32 %f5417, %r904; mov.u32 %r5146, 1; mov.u32 %r5145, 0; mov.f32 %f5416, %f871; mov.u32 %r5143, %r902; mov.u32 %r5144, %r904; $L__BB1_1200: setp.eq.f32 %p1665, %f864, %f5416; @%p1665 bra $L__BB1_1204; bra.uni $L__BB1_1201; $L__BB1_1204: setp.eq.f32 %p1675, %f5417, %f865; mov.pred %p1674, -1; mov.pred %p2937, %p1674; @%p1675 bra $L__BB1_1208; setp.eq.f32 %p1677, %f867, 0f7F800000; and.b32 %r3375, %r5144, 2147483647; mov.b32 %f3618, %r3375; setp.eq.f32 %p1678, %f3618, 0f7F800000; or.pred %p1679, %p1677, %p1678; mov.pred %p2937, 0; @%p1679 bra $L__BB1_1208; sub.f32 %f3619, %f5417, %f865; abs.f32 %f883, %f3619; setp.le.f32 %p1681, %f883, 0f34000000; mov.pred %p2937, %p1674; @%p1681 bra $L__BB1_1208; abs.f32 %f3620, %f5417; abs.f32 %f3621, %f865; setp.gt.f32 %p1682, %f3621, %f3620; selp.f32 %f3622, %f3621, %f3620, %p1682; mul.f32 %f3623, %f3622, 0f34000000; setp.le.f32 %p2937, %f883, %f3623; bra.uni $L__BB1_1208; $L__BB1_1201: setp.eq.f32 %p1667, %f866, 0f7F800000; and.b32 %r3374, %r5143, 2147483647; mov.b32 %f3612, %r3374; setp.eq.f32 %p1668, %f3612, 0f7F800000; or.pred %p1669, %p1667, %p1668; mov.pred %p2937, 0; @%p1669 bra $L__BB1_1208; bra.uni $L__BB1_1202; $L__BB1_1208: cvt.u64.u32 %rd8523, %r5144; cvt.u64.u32 %rd8524, %r5143; bfi.b64 %rd2892, %rd8523, %rd8524, 32, 32; mov.b64 {%r3376, %r3377}, %rd2892; selp.u64 %rd2893, 1, 0, %p2937; mov.b32 %f885, %r3377; mov.b32 %f884, %r3376; sub.f32 %f3624, %f884, %f864; sub.f32 %f3625, %f885, %f865; mul.f32 %f3626, %f3625, %f3625; fma.rn.f32 %f3627, %f3624, %f3624, %f3626; add.f32 %f3628, %f3627, 0f00000000; 
sqrt.rn.f32 %f887, %f3628; setp.geu.f32 %p1683, %f887, %f5418; setp.ne.s32 %p1684, %r918, 2; and.pred %p1685, %p1684, %p1683; @%p1685 bra $L__BB1_1210; add.s64 %rd12316, %rd12315, -1; st.local.u64 [%rd2885], %rd12316; st.local.v2.f32 [%rd2885+8], {%f884, %f885}; mov.b64 {%r3380, %r3381}, %rd2893; st.local.v2.u32 [%rd2885+16], {%r3380, %r5145}; st.local.v2.u32 [%rd2885+24], {%r5146, %r5147}; st.local.f32 [%rd2885+32], %f887; st.local.u32 [%rd2885+36], %rd2890; st.local.u32 [%rd2885+44], %rd2891; st.local.u32 [%rd2885+40], %rd8521; st.local.u32 [%rd2885+48], %rd8522; mov.f32 %f5418, %f887; mov.u32 %r918, %r5145; $L__BB1_1210: add.s64 %rd2896, %rd12315, 1; setp.lt.u64 %p1686, %rd12315, %rd2886; mov.u64 %rd12315, %rd2896; @%p1686 bra $L__BB1_1194; ld.local.u32 %rd8531, [%rd2885+36]; ld.local.u32 %rd8532, [%rd2885+40]; bfi.b64 %rd8533, %rd8532, %rd8531, 32, 32; mov.u64 %rd8530, 0; cvt.u32.u64 %r3382, %rd8533; mov.b32 %f3629, %r3382; shr.u64 %rd8534, %rd8533, 32; cvt.u32.u64 %r3383, %rd8534; mov.b32 %f3630, %r3383; ld.local.u32 %rd8535, [%rd2885+44]; ld.local.u32 %rd8536, [%rd2885+48]; bfi.b64 %rd8537, %rd8536, %rd8535, 32, 32; cvt.u32.u64 %r3384, %rd8537; shr.u64 %rd8538, %rd8537, 32; cvt.u32.u64 %r3385, %rd8538; mov.b32 %f3631, %r3384; sub.f32 %f889, %f3631, %f3629; mov.b32 %f3632, %r3385; sub.f32 %f890, %f3632, %f3630; mul.f32 %f3633, %f890, %f890; fma.rn.f32 %f3634, %f889, %f889, %f3633; add.f32 %f891, %f3634, 0f00000000; setp.leu.f32 %p1687, %f891, 0f28800000; mov.u64 %rd12317, %rd8530; mov.u64 %rd12318, %rd8530; mov.u64 %rd12319, %rd8530; @%p1687 bra $L__BB1_1213; neg.f32 %f3635, %f889; sqrt.rn.f32 %f3636, %f891; div.rn.f32 %f3637, %f890, %f3636; div.rn.f32 %f3638, %f3635, %f3636; mov.b32 %r3386, %f3638; mov.b32 %r3387, %f3637; mov.u64 %rd12319, 1; mov.b64 %rd8541, {%r3387, %r3386}; shr.u64 %rd12318, %rd8541, 32; shl.b64 %rd12317, %rd8541, 32; $L__BB1_1213: or.b64 %rd2903, %rd12319, %rd12317; or.b64 %rd2904, %rd8530, %rd12318; and.b64 %rd8542, %rd8530, 4294967295; 
xor.b64 %rd8543, %rd12319, 1; or.b64 %rd8544, %rd8543, %rd8542; setp.ne.s64 %p1688, %rd8544, 0; @%p1688 bra $L__BB1_1244; mov.b64 {%r3388, %r3389}, %rd2904; mov.b64 {%r3390, %r3391}, %rd2903; mov.b32 %f892, %r3391; mov.b32 %f893, %r3388; setp.eq.s32 %p1689, %r918, 1; @%p1689 bra $L__BB1_1242; bra.uni $L__BB1_1215; $L__BB1_1242: ld.local.u64 %rd8623, [%rd2885+8]; cvt.u32.u64 %r3412, %rd8623; mov.b32 %f3666, %r3412; shr.u64 %rd8624, %rd8623, 32; cvt.u32.u64 %r3413, %rd8624; mov.b32 %f3667, %r3413; sub.f32 %f3668, %f859, %f3666; sub.f32 %f3669, %f860, %f3667; mul.f32 %f3670, %f893, %f3669; fma.rn.f32 %f3671, %f892, %f3668, %f3670; setp.le.f32 %p2938, %f3671, 0f00000000; bra.uni $L__BB1_1243; $L__BB1_1249: ld.global.f32 %f3682, [%rd2884+-16]; mov.u64 %rd8644, 0; sub.f32 %f3683, %f859, %f3682; ld.global.f32 %f3684, [%rd2884+-12]; sub.f32 %f3685, %f860, %f3684; ld.global.f32 %f3686, [%rd2884+-20]; ld.global.f32 %f3687, [%rd2884+-24]; mul.f32 %f3688, %f3685, %f3686; fma.rn.f32 %f901, %f3683, %f3687, %f3688; mul.f32 %f3689, %f3683, %f3686; mul.f32 %f3690, %f3685, %f3687; sub.f32 %f902, %f3690, %f3689; mov.b32 %r3421, %f901; mov.b32 %r3422, %f902; cvt.u64.u32 %rd8645, %r3422; cvt.u64.u32 %rd8646, %r3421; bfi.b64 %rd8647, %rd8645, %rd8646, 32, 32; st.local.u64 [%rd8500], %rd8647; ld.global.u64 %rd3006, [%rd2884+-240]; setp.eq.s64 %p1710, %rd3006, 0; mov.u64 %rd8642, 2; mov.u64 %rd12473, %rd8644; mov.u64 %rd12474, %rd8642; mov.u64 %rd12475, %rd8644; @%p1710 bra $L__BB1_1355; cvta.to.local.u64 %rd3007, %rd8501; mov.u32 %r3429, 0; st.local.u32 [%rd3007], %r3429; mov.u32 %r3430, -16777217; st.local.u32 [%rd3007+4], %r3430; mov.u32 %r940, 1; st.local.u32 [%rd3007+512], %r940; ld.global.u64 %rd3008, [%rd2884+-248]; ld.global.u64 %rd3009, [%rd2884+-192]; ld.global.u64 %rd3010, [%rd2884+-200]; mov.u32 %r938, 2139095039; mov.u32 %r937, 4; bra.uni $L__BB1_1251; $L__BB1_1360: ld.global.f32 %f968, [%rd2884+-16]; sub.f32 %f3886, %f859, %f968; ld.global.f32 %f969, [%rd2884+-12]; sub.f32 
%f3887, %f860, %f969; ld.global.f32 %f3888, [%rd2884+-20]; ld.global.f32 %f970, [%rd2884+-24]; mul.f32 %f3889, %f3887, %f3888; fma.rn.f32 %f971, %f3886, %f970, %f3889; mul.f32 %f3890, %f3886, %f3888; mul.f32 %f3891, %f3887, %f970; sub.f32 %f972, %f3891, %f3890; mov.b32 %r1041, %f971; mov.b32 %r1042, %f972; ld.global.u64 %rd3369, [%rd2884+-216]; ld.global.u64 %rd3368, [%rd2884+-224]; sub.f32 %f3892, %f971, %f6; sub.f32 %f3893, %f972, %f6; mov.b32 %r3601, %f3892; mov.b32 %r3602, %f3893; cvt.u64.u32 %rd9034, %r3602; cvt.u64.u32 %rd9035, %r3601; add.f32 %f3894, %f6, %f971; add.f32 %f3895, %f6, %f972; mov.b32 %r3603, %f3894; mov.b32 %r3604, %f3895; cvt.u64.u32 %rd9036, %r3604; cvt.u64.u32 %rd9037, %r3603; bfi.b64 %rd9038, %rd9034, %rd9035, 32, 32; mov.b64 {%r3605, %r3606}, %rd9038; bfi.b64 %rd9039, %rd9036, %rd9037, 32, 32; mov.b64 {%r3607, %r3608}, %rd9039; cvta.to.local.u64 %rd3370, %rd8501; mov.u16 %rs702, 2; st.local.u8 [%rd3370+8], %rs702; mov.b32 %f976, %r3608; mov.b32 %f974, %r3606; mov.b32 %f975, %r3607; mov.b32 %f973, %r3605; ld.global.v2.f32 {%f3896, %f3897}, [%rd2884+-232]; div.rn.f32 %f979, %f973, %f3896; div.rn.f32 %f980, %f975, %f3896; ld.global.u64 %rd3371, [%rd2884+-256]; cvt.rn.f32.u64 %f3898, %rd3371; add.f32 %f3899, %f3898, 0fBF800000; rcp.rn.f32 %f981, %f3899; setp.lt.f32 %p1877, %f980, 0fBF000000; setp.gt.f32 %p1878, %f979, 0f3F000000; or.pred %p1879, %p1878, %p1877; @%p1879 bra $L__BB1_1392; add.f32 %f3900, %f979, 0f3F000000; div.rn.f32 %f3901, %f3900, %f981; cvt.rmi.f32.f32 %f3902, %f3901; add.s64 %rd9041, %rd3371, -2; cvt.rn.f32.u64 %f3903, %rd9041; setp.gt.f32 %p1880, %f3902, 0f00000000; setp.lt.f32 %p1881, %f3902, %f3903; selp.f32 %f3904, %f3902, %f3903, %p1881; selp.f32 %f3905, %f3904, 0f00000000, %p1880; setp.gt.f32 %p1882, %f3905, 0f5F7FFFFF; max.f32 %f3906, %f3905, 0f00000000; cvt.rzi.u64.f32 %rd9042, %f3906; selp.b64 %rd3377, -1, %rd9042, %p1882; add.f32 %f3907, %f980, 0f3F000000; div.rn.f32 %f3908, %f3907, %f981; cvt.rpi.f32.f32 %f3909, 
%f3908; add.s64 %rd9043, %rd3371, -1; cvt.rn.f32.u64 %f3910, %rd9043; setp.gt.f32 %p1883, %f3909, 0f00000000; setp.lt.f32 %p1884, %f3909, %f3910; selp.f32 %f3911, %f3909, %f3910, %p1884; selp.f32 %f3912, %f3911, 0f00000000, %p1883; setp.gt.f32 %p1885, %f3912, 0f5F7FFFFF; max.f32 %f3913, %f3912, 0f00000000; cvt.rzi.u64.f32 %rd9044, %f3913; selp.b64 %rd3373, -1, %rd9044, %p1885; setp.ge.u64 %p1886, %rd3377, %rd3373; @%p1886 bra $L__BB1_1392; div.rn.f32 %f982, %f974, %f3897; div.rn.f32 %f983, %f976, %f3897; ld.global.u64 %rd3374, [%rd2884+-240]; ld.global.u64 %rd3375, [%rd2884+-248]; ld.global.u64 %rd3376, [%rd2884+-264]; and.b32 %r3609, %r1041, 2147483647; mov.b32 %f984, %r3609; and.b32 %r3610, %r1042, 2147483647; mov.b32 %f985, %r3610; ld.local.v4.u32 {%r5208, %r5209, %r5210, %r3614}, [%rd3370]; mov.f32 %f5430, 0f7F7FFFFF; bra.uni $L__BB1_1363; $L__BB1_1398: ld.global.f32 %f1010, [%rd2884+-16]; sub.f32 %f3955, %f859, %f1010; ld.global.f32 %f1011, [%rd2884+-12]; sub.f32 %f3956, %f860, %f1011; ld.global.f32 %f1012, [%rd2884+-20]; ld.global.f32 %f1013, [%rd2884+-24]; mul.f32 %f3957, %f3956, %f1012; fma.rn.f32 %f1014, %f3955, %f1013, %f3957; mul.f32 %f3958, %f3955, %f1012; mul.f32 %f3959, %f3956, %f1013; sub.f32 %f1015, %f3959, %f3958; ld.global.u32 %rd9071, [%rd2884+-264]; ld.global.u32 %rd9072, [%rd2884+-260]; bfi.b64 %rd9073, %rd9072, %rd9071, 32, 32; cvt.u32.u64 %r3653, %rd9073; mov.b32 %f3960, %r3653; shr.u64 %rd9074, %rd9073, 32; cvt.u32.u64 %r3654, %rd9074; mov.b32 %f3961, %r3654; neg.f32 %f3962, %f3960; neg.f32 %f3963, %f3961; sub.f32 %f1016, %f3962, %f1014; sub.f32 %f1017, %f3963, %f1015; sub.f32 %f1018, %f1014, %f3960; sub.f32 %f1019, %f1015, %f3961; setp.ge.f32 %p1935, %f1016, 0f00000000; selp.f32 %f3964, %f1016, 0f00000000, %p1935; setp.ge.f32 %p1936, %f1017, 0f00000000; selp.f32 %f3965, %f1017, 0f00000000, %p1936; setp.ge.f32 %p1937, %f1018, 0f00000000; selp.f32 %f3966, %f1018, 0f00000000, %p1937; setp.ge.f32 %p1938, %f1019, 0f00000000; selp.f32 %f3967, 
%f1019, 0f00000000, %p1938; sub.f32 %f1020, %f3964, %f3966; mov.b32 %r3655, %f1020; sub.f32 %f1021, %f3965, %f3967; mov.b32 %r3656, %f1021; cvt.u64.u32 %rd9075, %r3656; cvt.u64.u32 %rd9076, %r3655; bfi.b64 %rd9077, %rd9075, %rd9076, 32, 32; st.local.u64 [%rd8484], %rd9077; mov.u64 %rd12489, 2; mov.u64 %rd12482, %rd2855; mov.u64 %rd12483, %rd8484; mov.u64 %rd12484, %rd8484; mov.u64 %rd12485, %rd8483; mov.u64 %rd12486, %rd8484; mov.u64 %rd12487, %rd8484; mov.u64 %rd12488, %rd8483; $L__BB1_1399: setp.eq.s64 %p1939, %rd12489, 0; @%p1939 bra $L__BB1_1402; add.s64 %rd12489, %rd12489, -1; add.s64 %rd9078, %rd12486, 8; setp.eq.s64 %p1940, %rd12486, %rd12482; selp.b64 %rd12482, %rd9078, %rd12482, %p1940; add.s64 %rd9079, %rd12483, 8; selp.b64 %rd12483, %rd9079, %rd12483, %p1940; add.s64 %rd9080, %rd12484, 8; selp.b64 %rd12484, %rd9080, %rd12484, %p1940; add.s64 %rd9081, %rd12485, 8; selp.b64 %rd12485, %rd9081, %rd12485, %p1940; selp.b64 %rd9082, %rd9079, %rd12486, %p1940; selp.b64 %rd9083, %rd9080, %rd12487, %p1940; selp.b64 %rd9084, %rd9081, %rd12488, %p1940; setp.eq.s64 %p1941, %rd12489, 0; add.s64 %rd9085, %rd9082, 4; add.s64 %rd9086, %rd9083, 4; add.s64 %rd9087, %rd9084, 4; selp.b64 %rd12486, %rd9082, %rd9085, %p1941; selp.b64 %rd12487, %rd9083, %rd9086, %p1941; selp.b64 %rd12488, %rd9084, %rd9087, %p1941; ld.local.f32 %f3968, [%rd9083]; setp.eq.f32 %p1942, %f3968, 0f00000000; @%p1942 bra $L__BB1_1399; add.f32 %f3969, %f1014, %f1020; mov.b32 %r3657, %f3969; add.f32 %f3970, %f1015, %f1021; mov.b32 %r3658, %f3970; cvt.u64.u32 %rd9090, %r3658; cvt.u64.u32 %rd9091, %r3657; bfi.b64 %rd12492, %rd9090, %rd9091, 32, 32; mov.u64 %rd12493, 0; bra.uni $L__BB1_1415; $L__BB1_1402: setp.lt.f32 %p1943, %f1016, %f1018; mov.f32 %f5431, 0fFF7FFFFF; @%p1943 bra $L__BB1_1405; bra.uni $L__BB1_1403; $L__BB1_1405: setp.leu.f32 %p1948, %f1018, 0fFF7FFFFF; mov.pred %p2942, 0; @%p1948 bra $L__BB1_1407; mov.f32 %f5431, %f1018; bra.uni $L__BB1_1407; $L__BB1_1403: setp.leu.f32 %p1945, %f1016, 
0fFF7FFFFF; mov.pred %p2942, 0; @%p1945 bra $L__BB1_1407; mov.pred %p2942, -1; mov.f32 %f5431, %f1016; $L__BB1_1407: setp.lt.f32 %p1950, %f1017, %f1019; @%p1950 bra $L__BB1_1410; bra.uni $L__BB1_1408; $L__BB1_1410: setp.gt.f32 %p1952, %f1019, %f5431; @%p1952 bra $L__BB1_1413; bra.uni $L__BB1_1411; $L__BB1_1413: cvta.to.local.u64 %rd9098, %rd8501; mov.u64 %rd9099, 0; st.local.u64 [%rd9098], %rd9099; neg.f32 %f5433, %f1019; mov.u64 %rd12491, %rd2864; bra.uni $L__BB1_1414; $L__BB1_1408: setp.leu.f32 %p1951, %f1017, %f5431; @%p1951 bra $L__BB1_1411; mov.u64 %rd9094, 0; st.local.u64 [%rd8502], %rd9094; mov.u64 %rd12491, %rd2864; mov.f32 %f5431, %f1017; bra.uni $L__BB1_1412; $L__BB1_1411: mov.u64 %rd9096, 0; st.local.u64 [%rd8502], %rd9096; neg.f32 %f5433, %f5431; not.pred %p1953, %p2942; mov.u64 %rd12491, %rd8502; @%p1953 bra $L__BB1_1414; $L__BB1_1412: mov.f32 %f5433, %f5431; $L__BB1_1414: st.local.f32 [%rd12491], %f5433; ld.local.u64 %rd9104, [%rd8502]; cvt.u32.u64 %r3659, %rd9104; mov.b32 %f3973, %r3659; shr.u64 %rd9105, %rd9104, 32; cvt.u32.u64 %r3660, %rd9105; mov.b32 %f3974, %r3660; add.f32 %f3975, %f1014, %f3973; add.f32 %f3976, %f1015, %f3974; mov.b32 %r3661, %f3975; mov.b32 %r3662, %f3976; cvt.u64.u32 %rd9106, %r3662; cvt.u64.u32 %rd9107, %r3661; bfi.b64 %rd12492, %rd9106, %rd9107, 32, 32; mov.u64 %rd12493, 1; $L__BB1_1415: mov.u64 %rd11207, 0; cvt.u32.u64 %r3663, %rd12492; mov.b32 %f3977, %r3663; shr.u64 %rd9108, %rd12492, 32; cvt.u32.u64 %r3664, %rd9108; mov.b32 %f3978, %r3664; mul.f32 %f3979, %f1013, %f3977; mul.f32 %f3980, %f1012, %f3978; sub.f32 %f3981, %f3979, %f3980; mul.f32 %f3982, %f1013, %f3978; fma.rn.f32 %f3983, %f1012, %f3977, %f3982; add.f32 %f3984, %f1010, %f3981; mov.b32 %r3665, %f3984; add.f32 %f3985, %f1011, %f3983; mov.b32 %r3666, %f3985; cvt.u64.u32 %rd9109, %r3666; cvt.u64.u32 %rd9110, %r3665; bfi.b64 %rd9111, %rd9109, %rd9110, 32, 32; or.b64 %rd9112, %rd11207, %rd9111; mov.b64 {%r5211, %r5212}, %rd9112; mov.b64 {%r5213, %r3667}, %rd12493; 
bra.uni $L__BB1_1416; $L__BB1_1380: sub.f32 %f3926, %f5428, %f971; abs.f32 %f1003, %f3926; setp.le.f32 %p1905, %f1003, 0f34000000; @%p1905 bra $L__BB1_1382; abs.f32 %f3927, %f5428; abs.f32 %f3928, %f971; setp.gt.f32 %p1907, %f3928, %f3927; selp.f32 %f3929, %f3928, %f3927, %p1907; mul.f32 %f3930, %f3929, 0f34000000; setp.gtu.f32 %p1908, %f1003, %f3930; @%p1908 bra $L__BB1_1386; bra.uni $L__BB1_1382; $L__BB1_1363: setp.gt.u64 %p1887, %rd3374, %rd3377; @%p1887 bra $L__BB1_1365; bra.uni $L__BB1_1364; $L__BB1_1365: add.s64 %rd9045, %rd3375, %rd3377; ld.u8 %rs703, [%rd9045]; setp.eq.s16 %p1888, %rs703, 0; @%p1888 bra $L__BB1_1390; cvt.rn.f32.u64 %f3915, %rd3377; fma.rn.f32 %f987, %f981, %f3915, 0fBF000000; setp.gt.u64 %p1889, %rd3371, %rd3377; @%p1889 bra $L__BB1_1368; bra.uni $L__BB1_1367; $L__BB1_1368: shl.b64 %rd9046, %rd3377, 2; add.s64 %rd3378, %rd3376, %rd9046; ld.f32 %f988, [%rd3378]; add.s64 %rd9047, %rd3377, 1; setp.gt.u64 %p1890, %rd3371, %rd9047; @%p1890 bra $L__BB1_1370; bra.uni $L__BB1_1369; $L__BB1_1370: ld.f32 %f989, [%rd3378+4]; setp.gt.f32 %p1891, %f989, %f983; setp.gt.f32 %p1892, %f988, %f983; and.pred %p1893, %p1892, %p1891; @%p1893 bra $L__BB1_1390; setp.lt.f32 %p1894, %f988, %f982; setp.lt.f32 %p1895, %f989, %f982; and.pred %p1896, %p1894, %p1895; @%p1896 bra $L__BB1_1390; mul.f32 %f3916, %f3896, %f987; mov.b32 %r3615, %f3916; mul.f32 %f992, %f3897, %f988; mov.b32 %r3616, %f992; cvt.u64.u32 %rd9048, %r3616; cvt.u64.u32 %rd9049, %r3615; add.f32 %f3917, %f981, %f987; mul.f32 %f990, %f3896, %f3917; mov.b32 %r1049, %f990; mul.f32 %f3918, %f3897, %f989; mov.b32 %r3617, %f3918; cvt.u64.u32 %rd9050, %r3617; cvt.u64.u32 %rd9051, %r1049; bfi.b64 %rd9052, %rd9050, %rd9051, 32, 32; bfi.b64 %rd9053, %rd9048, %rd9049, 32, 32; cvt.u32.u64 %r5206, %rd9053; mov.b32 %f5428, %r5206; sub.f32 %f993, %f990, %f5428; sub.f32 %f994, %f3918, %f992; sub.f32 %f3919, %f971, %f5428; sub.f32 %f3920, %f972, %f992; mul.f32 %f3921, %f994, %f3920; fma.rn.f32 %f995, %f993, %f3919, 
%f3921; mul.f32 %f3922, %f994, %f994; fma.rn.f32 %f3923, %f993, %f993, %f3922; add.f32 %f996, %f3923, 0f00000000; setp.gtu.f32 %p1897, %f995, 0f00000000; mov.b64 {%r3618, %r5207}, %rd9053; mov.b64 {%r3619, %r1052}, %rd9052; @%p1897 bra $L__BB1_1374; bra.uni $L__BB1_1373; $L__BB1_1374: setp.ltu.f32 %p1898, %f995, %f996; @%p1898 bra $L__BB1_1376; bra.uni $L__BB1_1375; $L__BB1_1376: setp.eq.f32 %p1899, %f996, 0f00000000; @%p1899 bra $L__BB1_1389; div.rn.f32 %f3924, %f995, %f996; fma.rn.f32 %f5428, %f993, %f3924, %f5428; mov.b32 %r5206, %f5428; fma.rn.f32 %f5429, %f994, %f3924, %f992; mov.b32 %r5207, %f5429; bra.uni $L__BB1_1378; $L__BB1_1373: mov.b32 %f5429, %r5207; bra.uni $L__BB1_1378; $L__BB1_1375: mov.b32 %f5429, %r1052; mov.f32 %f5428, %f990; mov.u32 %r5206, %r1049; mov.u32 %r5207, %r1052; $L__BB1_1378: setp.eq.f32 %p1900, %f971, %f5428; @%p1900 bra $L__BB1_1382; bra.uni $L__BB1_1379; $L__BB1_1382: setp.eq.f32 %p1910, %f5429, %f972; mov.pred %p1909, -1; mov.pred %p2940, %p1909; @%p1910 bra $L__BB1_1386; setp.eq.f32 %p1912, %f985, 0f7F800000; and.b32 %r3621, %r5207, 2147483647; mov.b32 %f3931, %r3621; setp.eq.f32 %p1913, %f3931, 0f7F800000; or.pred %p1914, %p1912, %p1913; mov.pred %p2940, 0; @%p1914 bra $L__BB1_1386; sub.f32 %f3932, %f5429, %f972; abs.f32 %f1004, %f3932; setp.le.f32 %p1916, %f1004, 0f34000000; mov.pred %p2940, %p1909; @%p1916 bra $L__BB1_1386; abs.f32 %f3933, %f5429; abs.f32 %f3934, %f972; setp.gt.f32 %p1917, %f3934, %f3933; selp.f32 %f3935, %f3934, %f3933, %p1917; mul.f32 %f3936, %f3935, 0f34000000; setp.le.f32 %p2940, %f1004, %f3936; bra.uni $L__BB1_1386; $L__BB1_1379: setp.eq.f32 %p1902, %f984, 0f7F800000; and.b32 %r3620, %r5206, 2147483647; mov.b32 %f3925, %r3620; setp.eq.f32 %p1903, %f3925, 0f7F800000; or.pred %p1904, %p1902, %p1903; mov.pred %p2940, 0; @%p1904 bra $L__BB1_1386; bra.uni $L__BB1_1380; $L__BB1_1386: cvt.u64.u32 %rd9054, %r5207; cvt.u64.u32 %rd9055, %r5206; bfi.b64 %rd3379, %rd9054, %rd9055, 32, 32; mov.b64 {%r3622, %r3623}, 
%rd3379; selp.u64 %rd3380, 1, 0, %p2940; mov.b32 %f3937, %r3622; sub.f32 %f3938, %f3937, %f971; mov.b32 %f3939, %r3623; sub.f32 %f3940, %f3939, %f972; mul.f32 %f3941, %f3940, %f3940; fma.rn.f32 %f3942, %f3938, %f3938, %f3941; add.f32 %f1005, %f3942, 0f00000000; setp.geu.f32 %p1918, %f1005, %f5430; @%p1918 bra $L__BB1_1390; sqrt.rn.f32 %f3943, %f1005; setp.gtu.f32 %p1919, %f3943, %f6; mov.f32 %f5430, %f1005; @%p1919 bra $L__BB1_1390; mov.b64 {%r5210, %r3624}, %rd3380; mov.u32 %r5208, %r3622; mov.u32 %r5209, %r3623; mov.f32 %f5430, %f1005; $L__BB1_1390: add.s64 %rd3377, %rd3377, 1; setp.lt.u64 %p1920, %rd3377, %rd3373; @%p1920 bra $L__BB1_1363; st.local.u32 [%rd3370+8], %r5210; mov.b64 %rd9056, {%r5208, %r5209}; st.local.u64 [%rd3370], %rd9056; $L__BB1_1392: cvt.u64.u32 %rd9057, %r1041; cvt.u64.u32 %rd9058, %r1042; bfi.b64 %rd3382, %rd9058, %rd9057, 32, 32; ld.local.v4.u32 {%r3628, %r3629, %r3630, %r3631}, [%rd3370]; mov.b64 %rd3384, {%r3630, %r3631}; mov.b64 %rd3383, {%r3628, %r3629}; mov.b32 {%rs704, %rs705}, %r3630; and.b16 %rs706, %rs704, 255; setp.eq.s16 %p1921, %rs706, 2; cvt.u64.u16 %rd9059, %rs704; and.b64 %rd9060, %rd9059, 255; selp.b64 %rd9061, 2, %rd9060, %p1921; and.b64 %rd9062, %rd3384, 4294967040; or.b64 %rd9063, %rd9062, %rd9061; mov.b64 {%r3636, %r3637}, %rd9063; mov.b32 {%rs1033, %rs707}, %r3636; and.b16 %rs708, %rs1033, 255; setp.eq.s16 %p1922, %rs708, 2; mov.u32 %r5213, 2; mov.u32 %r5211, 0; mov.u32 %r5212, %r5211; @%p1922 bra $L__BB1_1416; ld.global.u8 %rs709, [%rd2884+-208]; setp.eq.s16 %p1923, %rs709, 0; shr.u64 %rd9064, %rd3383, 32; cvt.u32.u64 %r3638, %rd9064; mov.b32 %f1007, %r3638; @%p1923 bra $L__BB1_1397; mov.b64 {%r3639, %r3640}, %rd3382; mov.b32 %f1009, %r3640; mov.b32 %f1008, %r3639; mov.b64 {%r3641, %r3642}, %rd3368; mov.b64 {%r3643, %r3644}, %rd3369; ld.global.u8 %rs135, [%rd2884+-207]; mov.b32 %f3944, %r3643; setp.gt.f32 %p1925, %f1008, %f3944; mov.b32 %f3945, %r3641; setp.lt.f32 %p1926, %f1008, %f3945; or.pred %p1927, %p1926, 
%p1925; mov.pred %p2941, 0; @%p1927 bra $L__BB1_1396; setp.geu.f32 %p1928, %f1009, 0fFF7FFFFF; setp.leu.f32 %p1929, %f1009, 0f7F7FFFFF; and.pred %p2941, %p1929, %p1928; $L__BB1_1396: setp.ge.f32 %p1930, %f972, %f1007; setp.le.f32 %p1931, %f972, %f1007; setp.eq.s16 %p1932, %rs135, 0; selp.u32 %r3645, -1, 0, %p1930; selp.u32 %r3646, -1, 0, %p1931; selp.b32 %r3647, %r3646, %r3645, %p1932; and.b32 %r3648, %r3647, 1; setp.eq.b32 %p1933, %r3648, 1; and.pred %p1934, %p1933, %p2941; selp.u16 %rs1033, 1, 0, %p1934; $L__BB1_1397: cvt.u32.u64 %r3649, %rd3383; mov.b32 %f3946, %r3649; mul.f32 %f3947, %f970, %f3946; ld.global.f32 %f3948, [%rd2884+-20]; mul.f32 %f3949, %f3948, %f1007; sub.f32 %f3950, %f3947, %f3949; mul.f32 %f3951, %f3948, %f3946; fma.rn.f32 %f3952, %f970, %f1007, %f3951; add.f32 %f3953, %f968, %f3950; mov.b32 %r3650, %f3953; add.f32 %f3954, %f969, %f3952; mov.b32 %r3651, %f3954; cvt.u64.u32 %rd9065, %r3651; cvt.u64.u32 %rd9066, %r3650; cvt.u64.u16 %rd9067, %rs1033; bfi.b64 %rd9068, %rd9065, %rd9066, 32, 32; and.b64 %rd9069, %rd9067, 255; mov.b64 {%r5211, %r5212}, %rd9068; mov.b64 {%r5213, %r3652}, %rd9069; bra.uni $L__BB1_1416; $L__BB1_1215: ld.local.u32 %r3392, [%rd2885+24]; setp.eq.s32 %p1690, %r3392, 0; @%p1690 bra $L__BB1_1228; setp.ne.s32 %p1691, %r3392, 1; @%p1691 bra $L__BB1_1241; add.s64 %rd2905, %rd12316, 1; or.b64 %rd8545, %rd2905, %rd2886; and.b64 %rd8546, %rd8545, -4294967296; setp.eq.s64 %p1692, %rd8546, 0; @%p1692 bra $L__BB1_1219; rem.u64 %rd12320, %rd2905, %rd2886; bra.uni $L__BB1_1220; $L__BB1_1228: setp.eq.s64 %p1699, %rd12316, 0; selp.b64 %rd2952, %rd2886, %rd12316, %p1699; add.s64 %rd8585, %rd2952, -1; setp.gt.u64 %p1700, %rd2886, %rd8585; @%p1700 bra $L__BB1_1230; bra.uni $L__BB1_1229; $L__BB1_1230: shl.b64 %rd8586, %rd2952, 3; add.s64 %rd8587, %rd2887, %rd8586; ld.u32 %rd8588, [%rd8587+-8]; ld.u32 %rd8589, [%rd8587+-4]; bfi.b64 %rd2953, %rd8589, %rd8588, 32, 32; or.b64 %rd8590, %rd2952, %rd2886; and.b64 %rd8591, %rd8590, -4294967296; 
setp.eq.s64 %p1701, %rd8591, 0; @%p1701 bra $L__BB1_1232; rem.u64 %rd12337, %rd2952, %rd2886; bra.uni $L__BB1_1233; $L__BB1_1346: ld.u32 %r3578, [%rd3018+76]; cvt.u64.u32 %rd8973, %r3578; setp.le.u64 %p1867, %rd3009, %rd8973; mul.wide.u32 %rd8974, %r3578, 12; add.s64 %rd8975, %rd3010, %rd8974; setp.eq.s64 %p1868, %rd8975, 0; or.pred %p1869, %p1867, %p1868; selp.b32 %r935, %r935, %r5167, %p1869; selp.b32 %r934, %r934, %r5166, %p1869; selp.b32 %r933, %r933, %r5165, %p1869; selp.b32 %r937, %r937, %r5180, %p1869; selp.b32 %r938, %r938, %r987, %p1869; $L__BB1_1251: mov.b32 %f903, %r938; $L__BB1_1252: mov.u32 %r939, %r940; setp.eq.s32 %p1711, %r939, 0; @%p1711 bra $L__BB1_1353; cvt.u64.u32 %rd8654, %r939; add.s64 %rd8655, %rd8654, -1; cvt.u32.u64 %r940, %rd8655; st.local.u32 [%rd3007+512], %r940; mul.wide.u32 %rd8656, %r939, 8; add.s64 %rd8657, %rd3007, %rd8656; ld.local.u32 %rd3016, [%rd8657+-4]; ld.local.u32 %rd8658, [%rd8657+-8]; shl.b64 %rd8659, %rd8658, 32; or.b64 %rd3015, %rd8659, 1; mov.b64 {%r3434, %r3435}, %rd3016; mov.b32 %f3691, %r3434; neg.f32 %f3692, %f3691; setp.le.f32 %p1712, %f903, %f3692; @%p1712 bra $L__BB1_1252; mov.b64 {%r3436, %r3437}, %rd3015; cvt.u64.u32 %rd3017, %r3437; setp.gt.u64 %p1713, %rd3006, %rd3017; @%p1713 bra $L__BB1_1256; bra.uni $L__BB1_1255; $L__BB1_1256: mul.lo.s64 %rd8660, %rd3017, 96; add.s64 %rd3018, %rd3008, %rd8660; ld.u8 %rs644, [%rd3018+88]; and.b16 %rs645, %rs644, 1; setp.eq.b16 %p1715, %rs645, 1; mov.pred %p2939, 0; xor.pred %p1716, %p1715, %p2939; not.pred %p1717, %p1716; @%p1717 bra $L__BB1_1258; ld.v4.u32 {%r3438, %r3439, %r3440, %r3441}, [%rd3018+64]; cvt.u64.u32 %rd8661, %r3438; setp.gt.u64 %p1719, %rd3009, %rd8661; mul.wide.u32 %rd8662, %r3438, 12; add.s64 %rd8663, %rd3010, %rd8662; selp.b64 %rd8664, %rd8663, 0, %p1719; setp.eq.s64 %p1720, %rd8664, 0; add.s64 %rd8665, %rd8664, 8; selp.b64 %rd12358, 0, %rd8665, %p1720; cvt.u64.u32 %rd8666, %r3439; setp.gt.u64 %p1721, %rd3009, %rd8666; mul.wide.u32 %rd8667, %r3439, 12; 
add.s64 %rd8668, %rd3010, %rd8667; selp.b64 %rd8669, %rd8668, 0, %p1721; setp.eq.s64 %p1722, %rd8669, 0; add.s64 %rd8670, %rd8669, 8; selp.b64 %rd12357, 0, %rd8670, %p1722; ld.u32 %r3445, [%rd3018+72]; cvt.u64.u32 %rd8671, %r3445; setp.gt.u64 %p1723, %rd3009, %rd8671; mul.wide.u32 %rd8672, %r3445, 12; add.s64 %rd8673, %rd3010, %rd8672; selp.b64 %rd8674, %rd8673, 0, %p1723; setp.eq.s64 %p1724, %rd8674, 0; add.s64 %rd8675, %rd8674, 8; selp.b64 %rd12356, 0, %rd8675, %p1724; cvt.u64.u32 %rd8676, %r3441; setp.gt.u64 %p1725, %rd3009, %rd8676; mul.wide.u32 %rd8677, %r3441, 12; add.s64 %rd8678, %rd3010, %rd8677; selp.b64 %rd8679, %rd8678, 0, %p1725; setp.eq.s64 %p1726, %rd8679, 0; add.s64 %rd8680, %rd8679, 8; selp.b64 %rd12355, 0, %rd8680, %p1726; mov.pred %p2939, -1; $L__BB1_1258: ld.v4.f32 {%f3693, %f3694, %f3695, %f3696}, [%rd3018]; sub.f32 %f3701, %f3693, %f901; sub.f32 %f3702, %f3694, %f901; sub.f32 %f3703, %f3695, %f901; sub.f32 %f3704, %f3696, %f901; ld.v4.f32 {%f3705, %f3706, %f3707, %f3708}, [%rd3018+16]; sub.f32 %f3713, %f3705, %f902; sub.f32 %f3714, %f3706, %f902; sub.f32 %f3715, %f3707, %f902; sub.f32 %f3716, %f3708, %f902; ld.v4.f32 {%f3717, %f3718, %f3719, %f3720}, [%rd3018+32]; sub.f32 %f3725, %f901, %f3717; sub.f32 %f3726, %f901, %f3718; sub.f32 %f3727, %f901, %f3719; sub.f32 %f3728, %f901, %f3720; ld.v4.f32 {%f3729, %f3730, %f3731, %f3732}, [%rd3018+48]; sub.f32 %f3737, %f902, %f3729; sub.f32 %f3738, %f902, %f3730; sub.f32 %f3739, %f902, %f3731; sub.f32 %f3740, %f902, %f3732; setp.ge.f32 %p1727, %f3701, %f3725; selp.f32 %f3741, %f3701, %f3725, %p1727; setp.ge.f32 %p1728, %f3702, %f3726; selp.f32 %f3742, %f3702, %f3726, %p1728; setp.ge.f32 %p1729, %f3703, %f3727; selp.f32 %f3743, %f3703, %f3727, %p1729; setp.ge.f32 %p1730, %f3704, %f3728; selp.f32 %f3744, %f3704, %f3728, %p1730; setp.ge.f32 %p1731, %f3713, %f3737; selp.f32 %f3745, %f3713, %f3737, %p1731; setp.ge.f32 %p1732, %f3714, %f3738; selp.f32 %f3746, %f3714, %f3738, %p1732; setp.ge.f32 %p1733, %f3715, 
%f3739; selp.f32 %f3747, %f3715, %f3739, %p1733; setp.ge.f32 %p1734, %f3716, %f3740; selp.f32 %f3748, %f3716, %f3740, %p1734; setp.ge.f32 %p1735, %f3741, 0f00000000; selp.f32 %f3749, %f3741, 0f00000000, %p1735; setp.ge.f32 %p1736, %f3742, 0f00000000; selp.f32 %f3750, %f3742, 0f00000000, %p1736; setp.ge.f32 %p1737, %f3743, 0f00000000; selp.f32 %f3751, %f3743, 0f00000000, %p1737; setp.ge.f32 %p1738, %f3744, 0f00000000; selp.f32 %f3752, %f3744, 0f00000000, %p1738; mov.b32 %r3446, %f3749; mov.b32 %r3447, %f3750; mov.b32 %r3448, %f3751; mov.b32 %r3449, %f3752; cvt.u64.u32 %rd8681, %r3449; cvt.u64.u32 %rd8682, %r3447; cvt.u64.u32 %rd8683, %r3446; cvt.u64.u32 %rd8684, %r3448; bfi.b64 %rd8685, %rd8681, %rd8684, 32, 32; bfi.b64 %rd8686, %rd8682, %rd8683, 32, 32; setp.ge.f32 %p1739, %f3745, 0f00000000; selp.f32 %f3753, %f3745, 0f00000000, %p1739; setp.ge.f32 %p1740, %f3746, 0f00000000; selp.f32 %f3754, %f3746, 0f00000000, %p1740; setp.ge.f32 %p1741, %f3747, 0f00000000; selp.f32 %f3755, %f3747, 0f00000000, %p1741; setp.ge.f32 %p1742, %f3748, 0f00000000; selp.f32 %f3756, %f3748, 0f00000000, %p1742; mov.b32 %r3450, %f3753; mov.b32 %r3451, %f3754; mov.b32 %r3452, %f3755; mov.b32 %r3453, %f3756; cvt.u64.u32 %rd8687, %r3453; cvt.u64.u32 %rd8688, %r3451; cvt.u64.u32 %rd8689, %r3450; cvt.u64.u32 %rd8690, %r3452; bfi.b64 %rd8691, %rd8687, %rd8690, 32, 32; bfi.b64 %rd8692, %rd8688, %rd8689, 32, 32; mov.b64 {%r3454, %r3455}, %rd8686; mov.b64 {%r3456, %r3457}, %rd8685; cvt.u64.u32 %rd8693, %r3457; cvt.u64.u32 %rd8694, %r3455; cvt.u64.u32 %rd8695, %r3456; bfi.b64 %rd8696, %rd8693, %rd8695, 32, 32; mov.b64 {%r3458, %r3459}, %rd8696; bfi.b64 %rd8697, %rd8694, %rd8683, 32, 32; mov.b64 {%r3460, %r3461}, %rd8697; mov.b32 %f3757, %r3460; mov.b32 %f3758, %r3461; mov.b32 %f3759, %r3458; mov.b32 %f3760, %r3459; mov.b32 %f3761, %r3454; mov.b32 %f3762, %r3455; mov.b32 %f3763, %r3456; mov.b32 %f3764, %r3457; mov.b64 {%r3462, %r3463}, %rd8692; mov.b64 {%r3464, %r3465}, %rd8691; cvt.u64.u32 %rd8698, 
%r3465; cvt.u64.u32 %rd8699, %r3463; cvt.u64.u32 %rd8700, %r3464; bfi.b64 %rd8701, %rd8698, %rd8700, 32, 32; mov.b64 {%r3466, %r3467}, %rd8701; bfi.b64 %rd8702, %rd8699, %rd8689, 32, 32; mov.b64 {%r3468, %r3469}, %rd8702; mov.b32 %f3765, %r3468; mov.b32 %f3766, %r3469; mov.b32 %f3767, %r3466; mov.b32 %f3768, %r3467; mov.b32 %f3769, %r3462; mov.b32 %f3770, %r3463; mov.b32 %f3771, %r3464; mov.b32 %f3772, %r3465; mul.f32 %f3773, %f3769, %f3765; mul.f32 %f3774, %f3770, %f3766; mul.f32 %f3775, %f3771, %f3767; mul.f32 %f3776, %f3772, %f3768; fma.rn.f32 %f3777, %f3761, %f3757, %f3773; fma.rn.f32 %f3778, %f3762, %f3758, %f3774; fma.rn.f32 %f3779, %f3763, %f3759, %f3775; fma.rn.f32 %f3780, %f3764, %f3760, %f3776; add.f32 %f3781, %f3777, 0f00000000; add.f32 %f3782, %f3778, 0f00000000; add.f32 %f3783, %f3779, 0f00000000; add.f32 %f3784, %f3780, 0f00000000; sqrt.rn.f32 %f3785, %f3781; sqrt.rn.f32 %f3786, %f3782; sqrt.rn.f32 %f3787, %f3783; sqrt.rn.f32 %f3788, %f3784; mov.b32 %r3470, %f3785; mov.b32 %r3471, %f3786; mov.b32 %r3472, %f3787; mov.b32 %r3473, %f3788; cvt.u64.u32 %rd8703, %r3473; cvt.u64.u32 %rd8704, %r3471; cvt.u64.u32 %rd8705, %r3470; cvt.u64.u32 %rd8706, %r3472; bfi.b64 %rd12464, %rd8703, %rd8706, 32, 32; mov.b64 {%r3474, %r3475}, %rd12464; bfi.b64 %rd12463, %rd8704, %rd8705, 32, 32; mov.b64 {%r3476, %r3477}, %rd12463; mov.b32 %f3789, %r3476; mov.b32 %f3790, %r3477; mov.b32 %f3791, %r3474; mov.b32 %f3792, %r3475; setp.lt.f32 %p1743, %f3789, %f903; setp.lt.f32 %p1744, %f3790, %f903; setp.lt.f32 %p1745, %f3791, %f903; setp.lt.f32 %p1746, %f3792, %f903; selp.u32 %r3478, 1, 0, %p1743; selp.u32 %r3479, -1, 0, %p1744; bfi.b32 %r3480, %r3479, %r3478, 8, 1; selp.u32 %r3481, -1, 0, %p1745; bfi.b32 %r3482, %r3481, %r3480, 16, 1; selp.u32 %r3483, -1, 0, %p1746; bfi.b32 %r3484, %r3483, %r3482, 24, 1; cvt.u64.u32 %rd8707, %r3484; mov.b64 {%r3485, %r3486}, %rd8707; mov.b32 {%rs646, %rs647}, %r3485; and.b16 %rs648, %rs646, 1; shr.u16 %rs649, %rs646, 7; and.b16 %rs650, %rs649, 2; 
or.b16 %rs651, %rs650, %rs648; shl.b16 %rs652, %rs647, 2; and.b16 %rs653, %rs652, 4; or.b16 %rs654, %rs651, %rs653; shr.u16 %rs655, %rs647, 5; and.b16 %rs656, %rs655, 8; or.b16 %rs657, %rs654, %rs656; cvt.u64.u16 %rd3029, %rs657; @%p2939 bra $L__BB1_1260; bra.uni $L__BB1_1259; $L__BB1_1260: mov.u64 %rd8708, 1; st.local.v2.u64 [%rd8], {%rd12358, %rd12357}; st.local.v2.u64 [%rd8+16], {%rd12356, %rd12355}; mov.f32 %f3793, 0f00000000; st.local.v4.f32 [%rd24], {%f3793, %f3793, %f3793, %f3793}; mov.u32 %r3497, 4; st.local.u32 [%rd7+16], %r3497; st.local.u32 [%rd7+52], %r3497; st.local.u32 [%rd7+88], %r3497; st.local.u32 [%rd7+124], %r3497; mov.u64 %rd3033, %rd8708; $L__BB1_1261: add.s64 %rd8712, %rd3033, -1; cvt.u32.u64 %r3498, %rd8712; shl.b64 %rd8714, %rd8708, %r3498; and.b64 %rd8715, %rd8714, %rd3029; setp.eq.s64 %p1747, %rd8715, 0; @%p1747 bra $L__BB1_1314; shl.b64 %rd8716, %rd3033, 3; add.s64 %rd8717, %rd8, %rd8716; ld.local.u64 %rd3034, [%rd8717+-8]; setp.eq.s64 %p1748, %rd3034, 0; @%p1748 bra $L__BB1_1314; ld.u32 %r941, [%rd3034]; cvt.u64.u32 %rd3035, %r941; ld.global.u64 %rd8718, [%rd2884+-160]; setp.gt.u64 %p1749, %rd8718, %rd3035; @%p1749 bra $L__BB1_1265; bra.uni $L__BB1_1264; $L__BB1_1265: ld.global.u64 %rd8719, [%rd2884+-168]; mul.lo.s64 %rd8720, %rd3035, 12; add.s64 %rd3036, %rd8719, %rd8720; ld.u32 %rd3037, [%rd3036+8]; ld.u32 %rd3038, [%rd3036]; ld.global.u64 %rd3039, [%rd2884+-176]; setp.gt.u64 %p1750, %rd3039, %rd3038; @%p1750 bra $L__BB1_1267; bra.uni $L__BB1_1266; $L__BB1_1267: ld.global.u64 %rd3040, [%rd2884+-184]; shl.b64 %rd8721, %rd3038, 3; add.s64 %rd8722, %rd3040, %rd8721; ld.u32 %rd8723, [%rd8722]; ld.u32 %rd8724, [%rd8722+4]; bfi.b64 %rd3041, %rd8724, %rd8723, 32, 32; ld.u32 %rd3042, [%rd3036+4]; setp.gt.u64 %p1751, %rd3039, %rd3042; @%p1751 bra $L__BB1_1269; bra.uni $L__BB1_1268; $L__BB1_1269: setp.gt.u64 %p1752, %rd3039, %rd3037; @%p1752 bra $L__BB1_1271; bra.uni $L__BB1_1270; $L__BB1_1271: shl.b64 %rd8725, %rd3042, 3; add.s64 %rd8726, 
%rd3040, %rd8725; shl.b64 %rd8727, %rd3037, 3; add.s64 %rd8728, %rd3040, %rd8727; cvt.u32.u64 %r3499, %rd3041; mov.b32 %f904, %r3499; shr.u64 %rd8729, %rd3041, 32; cvt.u32.u64 %r3500, %rd8729; mov.b32 %f905, %r3500; ld.u32 %rd8730, [%rd8726]; ld.u32 %rd8731, [%rd8726+4]; bfi.b64 %rd3043, %rd8731, %rd8730, 32, 32; cvt.u32.u64 %r3501, %rd3043; shr.u64 %rd8732, %rd3043, 32; cvt.u32.u64 %r3502, %rd8732; mov.b32 %f906, %r3501; sub.f32 %f907, %f906, %f904; mov.b32 %f5422, %r3502; sub.f32 %f909, %f5422, %f905; ld.u32 %rd8733, [%rd8728]; ld.u32 %rd8734, [%rd8728+4]; bfi.b64 %rd3044, %rd8734, %rd8733, 32, 32; cvt.u32.u64 %r3503, %rd3044; shr.u64 %rd8735, %rd3044, 32; cvt.u32.u64 %r3504, %rd8735; mov.b32 %f910, %r3503; sub.f32 %f911, %f910, %f904; mov.b32 %f912, %r3504; sub.f32 %f913, %f912, %f905; sub.f32 %f914, %f901, %f904; sub.f32 %f915, %f902, %f905; mul.f32 %f3794, %f909, %f915; fma.rn.f32 %f916, %f907, %f914, %f3794; mul.f32 %f3795, %f913, %f915; fma.rn.f32 %f917, %f911, %f914, %f3795; setp.le.f32 %p1753, %f916, 0f00000000; setp.le.f32 %p1754, %f917, 0f00000000; and.pred %p1755, %p1753, %p1754; @%p1755 bra $L__BB1_1309; bra.uni $L__BB1_1272; $L__BB1_1309: add.u64 %rd12449, %SP, 552; cvta.to.local.u64 %rd12447, %rd12449; add.u64 %rd12455, %SP, 0; cvta.to.local.u64 %rd12453, %rd12455; st.local.u64 [%rd12453], %rd3041; mov.u64 %rd12460, 2; mov.u64 %rd12446, %rd2862; mov.u64 %rd12448, %rd12447; mov.u64 %rd12450, %rd12447; mov.u64 %rd12451, %rd12447; mov.u64 %rd12452, %rd12449; mov.u64 %rd12454, %rd12453; mov.u64 %rd12456, %rd12453; mov.u64 %rd12457, %rd12453; mov.u64 %rd12458, %rd12455; mov.u64 %rd12459, %rd2856; $L__BB1_1310: setp.eq.s64 %p1808, %rd12460, 0; mov.u64 %rd12461, 1; @%p1808 bra $L__BB1_1312; add.s64 %rd12460, %rd12460, -1; add.s64 %rd8880, %rd12447, 8; setp.eq.s64 %p1809, %rd12450, %rd12446; selp.b64 %rd8881, %rd8880, %rd12450, %p1809; add.s64 %rd8882, %rd12448, 8; selp.b64 %rd8883, %rd8882, %rd12451, %p1809; add.s64 %rd8884, %rd12449, 8; selp.b64 %rd8885, 
%rd8884, %rd12452, %p1809; mov.u64 %rd12461, 0; setp.eq.s64 %p1810, %rd12460, 0; add.s64 %rd8886, %rd8881, 4; add.s64 %rd8887, %rd8883, 4; add.s64 %rd8888, %rd8885, 4; selp.b64 %rd3270, %rd8881, %rd8886, %p1810; selp.b64 %rd12451, %rd8883, %rd8887, %p1810; selp.b64 %rd12452, %rd8885, %rd8888, %p1810; selp.b64 %rd12447, %rd8880, %rd12447, %p1809; selp.b64 %rd12448, %rd8882, %rd12448, %p1809; selp.b64 %rd12449, %rd8884, %rd12449, %p1809; add.s64 %rd8889, %rd12450, 8; selp.b64 %rd12446, %rd8889, %rd12446, %p1809; add.s64 %rd8890, %rd12456, 8; setp.eq.s64 %p1811, %rd12453, %rd12459; selp.b64 %rd8891, %rd8890, %rd12453, %p1811; add.s64 %rd8892, %rd12457, 8; selp.b64 %rd8893, %rd8892, %rd12454, %p1811; add.s64 %rd8894, %rd12458, 8; selp.b64 %rd8895, %rd8894, %rd12455, %p1811; selp.b64 %rd12456, %rd8890, %rd12456, %p1811; selp.b64 %rd12457, %rd8892, %rd12457, %p1811; selp.b64 %rd12458, %rd8894, %rd12458, %p1811; add.s64 %rd8896, %rd12453, 8; selp.b64 %rd12459, %rd8896, %rd12459, %p1811; add.s64 %rd8897, %rd8891, 4; add.s64 %rd8898, %rd8893, 4; add.s64 %rd8899, %rd8895, 4; selp.b64 %rd12453, %rd8891, %rd8897, %p1810; selp.b64 %rd12454, %rd8893, %rd8898, %p1810; selp.b64 %rd12455, %rd8895, %rd8899, %p1810; ld.local.f32 %f3861, [%rd8893]; ld.local.f32 %f3862, [%rd8883]; setp.eq.f32 %p1812, %f3862, %f3861; mov.u64 %rd12450, %rd3270; @%p1812 bra $L__BB1_1310; $L__BB1_1312: mov.u64 %rd11184, 0; or.b64 %rd8901, %rd11184, %rd3041; mov.b64 {%r3546, %r3547}, %rd8901; mov.b64 {%r3548, %r3549}, %rd12461; cvt.u32.u64 %r3551, %rd11184; or.b32 %r5162, %r3551, %r3499; mov.u32 %r5163, 0; mov.b32 %f5426, %r3547; mov.b32 {%rs1032, %rs676}, %r3548; mov.u32 %r5164, %r5163; bra.uni $L__BB1_1313; $L__BB1_1272: sub.f32 %f918, %f901, %f906; sub.f32 %f919, %f902, %f5422; mul.f32 %f3796, %f909, %f919; fma.rn.f32 %f920, %f907, %f918, %f3796; mul.f32 %f3797, %f913, %f919; fma.rn.f32 %f921, %f911, %f918, %f3797; setp.ge.f32 %p1756, %f920, 0f00000000; setp.le.f32 %p1757, %f921, %f920; and.pred %p1758, 
%p1757, %p1756; @%p1758 bra $L__BB1_1305; bra.uni $L__BB1_1273; $L__BB1_1305: add.u64 %rd12433, %SP, 552; cvta.to.local.u64 %rd12431, %rd12433; add.u64 %rd12439, %SP, 0; cvta.to.local.u64 %rd12437, %rd12439; st.local.u64 [%rd12437], %rd3043; mov.u64 %rd12444, 2; mov.u64 %rd12430, %rd2862; mov.u64 %rd12432, %rd12431; mov.u64 %rd12434, %rd12431; mov.u64 %rd12435, %rd12431; mov.u64 %rd12436, %rd12433; mov.u64 %rd12438, %rd12437; mov.u64 %rd12440, %rd12437; mov.u64 %rd12441, %rd12437; mov.u64 %rd12442, %rd12439; mov.u64 %rd12443, %rd2857; $L__BB1_1306: setp.eq.s64 %p1803, %rd12444, 0; mov.u64 %rd12445, 1; @%p1803 bra $L__BB1_1308; add.s64 %rd12444, %rd12444, -1; add.s64 %rd8853, %rd12431, 8; setp.eq.s64 %p1804, %rd12434, %rd12430; selp.b64 %rd8854, %rd8853, %rd12434, %p1804; add.s64 %rd8855, %rd12432, 8; selp.b64 %rd8856, %rd8855, %rd12435, %p1804; add.s64 %rd8857, %rd12433, 8; selp.b64 %rd8858, %rd8857, %rd12436, %p1804; mov.u64 %rd12445, 0; setp.eq.s64 %p1805, %rd12444, 0; add.s64 %rd8859, %rd8854, 4; add.s64 %rd8860, %rd8856, 4; add.s64 %rd8861, %rd8858, 4; selp.b64 %rd3232, %rd8854, %rd8859, %p1805; selp.b64 %rd12435, %rd8856, %rd8860, %p1805; selp.b64 %rd12436, %rd8858, %rd8861, %p1805; selp.b64 %rd12431, %rd8853, %rd12431, %p1804; selp.b64 %rd12432, %rd8855, %rd12432, %p1804; selp.b64 %rd12433, %rd8857, %rd12433, %p1804; add.s64 %rd8862, %rd12434, 8; selp.b64 %rd12430, %rd8862, %rd12430, %p1804; add.s64 %rd8863, %rd12440, 8; setp.eq.s64 %p1806, %rd12437, %rd12443; selp.b64 %rd8864, %rd8863, %rd12437, %p1806; add.s64 %rd8865, %rd12441, 8; selp.b64 %rd8866, %rd8865, %rd12438, %p1806; add.s64 %rd8867, %rd12442, 8; selp.b64 %rd8868, %rd8867, %rd12439, %p1806; selp.b64 %rd12440, %rd8863, %rd12440, %p1806; selp.b64 %rd12441, %rd8865, %rd12441, %p1806; selp.b64 %rd12442, %rd8867, %rd12442, %p1806; add.s64 %rd8869, %rd12437, 8; selp.b64 %rd12443, %rd8869, %rd12443, %p1806; add.s64 %rd8870, %rd8864, 4; add.s64 %rd8871, %rd8866, 4; add.s64 %rd8872, %rd8868, 4; selp.b64 
%rd12437, %rd8864, %rd8870, %p1805; selp.b64 %rd12438, %rd8866, %rd8871, %p1805; selp.b64 %rd12439, %rd8868, %rd8872, %p1805; ld.local.f32 %f3859, [%rd8866]; ld.local.f32 %f3860, [%rd8856]; setp.eq.f32 %p1807, %f3860, %f3859; mov.u64 %rd12434, %rd3232; @%p1807 bra $L__BB1_1306; $L__BB1_1308: mov.u64 %rd11183, 0; or.b64 %rd8874, %rd11183, %rd3043; mov.b64 {%r3538, %r3539}, %rd8874; mov.b64 {%r3540, %r3541}, %rd12445; cvt.u32.u64 %r3543, %rd11183; or.b32 %r5162, %r3543, %r3501; mov.u32 %r5163, 0; mov.b32 %f5426, %r3539; mov.u32 %r5164, 1; mov.b32 {%rs1032, %rs672}, %r3540; bra.uni $L__BB1_1313; $L__BB1_1273: sub.f32 %f922, %f901, %f910; sub.f32 %f923, %f902, %f912; mul.f32 %f3798, %f909, %f923; fma.rn.f32 %f924, %f907, %f922, %f3798; mul.f32 %f3799, %f913, %f923; fma.rn.f32 %f925, %f911, %f922, %f3799; setp.ge.f32 %p1759, %f925, 0f00000000; setp.le.f32 %p1760, %f924, %f925; and.pred %p1761, %p1760, %p1759; @%p1761 bra $L__BB1_1301; bra.uni $L__BB1_1274; $L__BB1_1301: add.u64 %rd12417, %SP, 552; cvta.to.local.u64 %rd12415, %rd12417; add.u64 %rd12423, %SP, 0; cvta.to.local.u64 %rd12421, %rd12423; st.local.u64 [%rd12421], %rd3044; mov.u64 %rd12428, 2; mov.u64 %rd12414, %rd2862; mov.u64 %rd12416, %rd12415; mov.u64 %rd12418, %rd12415; mov.u64 %rd12419, %rd12415; mov.u64 %rd12420, %rd12417; mov.u64 %rd12422, %rd12421; mov.u64 %rd12424, %rd12421; mov.u64 %rd12425, %rd12421; mov.u64 %rd12426, %rd12423; mov.u64 %rd12427, %rd2858; $L__BB1_1302: setp.eq.s64 %p1798, %rd12428, 0; mov.u64 %rd12429, 1; @%p1798 bra $L__BB1_1304; add.s64 %rd12428, %rd12428, -1; add.s64 %rd8826, %rd12415, 8; setp.eq.s64 %p1799, %rd12418, %rd12414; selp.b64 %rd8827, %rd8826, %rd12418, %p1799; add.s64 %rd8828, %rd12416, 8; selp.b64 %rd8829, %rd8828, %rd12419, %p1799; add.s64 %rd8830, %rd12417, 8; selp.b64 %rd8831, %rd8830, %rd12420, %p1799; mov.u64 %rd12429, 0; setp.eq.s64 %p1800, %rd12428, 0; add.s64 %rd8832, %rd8827, 4; add.s64 %rd8833, %rd8829, 4; add.s64 %rd8834, %rd8831, 4; selp.b64 %rd3194, 
%rd8827, %rd8832, %p1800; selp.b64 %rd12419, %rd8829, %rd8833, %p1800; selp.b64 %rd12420, %rd8831, %rd8834, %p1800; selp.b64 %rd12415, %rd8826, %rd12415, %p1799; selp.b64 %rd12416, %rd8828, %rd12416, %p1799; selp.b64 %rd12417, %rd8830, %rd12417, %p1799; add.s64 %rd8835, %rd12418, 8; selp.b64 %rd12414, %rd8835, %rd12414, %p1799; add.s64 %rd8836, %rd12424, 8; setp.eq.s64 %p1801, %rd12421, %rd12427; selp.b64 %rd8837, %rd8836, %rd12421, %p1801; add.s64 %rd8838, %rd12425, 8; selp.b64 %rd8839, %rd8838, %rd12422, %p1801; add.s64 %rd8840, %rd12426, 8; selp.b64 %rd8841, %rd8840, %rd12423, %p1801; selp.b64 %rd12424, %rd8836, %rd12424, %p1801; selp.b64 %rd12425, %rd8838, %rd12425, %p1801; selp.b64 %rd12426, %rd8840, %rd12426, %p1801; add.s64 %rd8842, %rd12421, 8; selp.b64 %rd12427, %rd8842, %rd12427, %p1801; add.s64 %rd8843, %rd8837, 4; add.s64 %rd8844, %rd8839, 4; add.s64 %rd8845, %rd8841, 4; selp.b64 %rd12421, %rd8837, %rd8843, %p1800; selp.b64 %rd12422, %rd8839, %rd8844, %p1800; selp.b64 %rd12423, %rd8841, %rd8845, %p1800; ld.local.f32 %f3857, [%rd8839]; ld.local.f32 %f3858, [%rd8829]; setp.eq.f32 %p1802, %f3858, %f3857; mov.u64 %rd12418, %rd3194; @%p1802 bra $L__BB1_1302; $L__BB1_1304: mov.u64 %rd11182, 0; or.b64 %rd8847, %rd11182, %rd3044; mov.b64 {%r3530, %r3531}, %rd8847; mov.b64 {%r3532, %r3533}, %rd12429; cvt.u32.u64 %r3535, %rd11182; or.b32 %r5162, %r3535, %r3503; mov.u32 %r5163, 0; mov.b32 %f5426, %r3531; mov.b32 {%rs1032, %rs668}, %r3532; mov.u32 %r5164, 2; bra.uni $L__BB1_1313; $L__BB1_1274: sub.f32 %f926, %f910, %f906; sub.f32 %f927, %f912, %f5422; mul.f32 %f3800, %f909, %f911; mul.f32 %f3801, %f907, %f913; sub.f32 %f928, %f3801, %f3800; mul.f32 %f3802, %f909, %f914; mul.f32 %f3803, %f907, %f915; sub.f32 %f3804, %f3803, %f3802; mul.f32 %f3805, %f928, %f3804; setp.lt.f32 %p1762, %f3805, 0f00000000; setp.ge.f32 %p1763, %f916, 0f00000000; and.pred %p1764, %p1763, %p1762; setp.le.f32 %p1765, %f920, 0f00000000; and.pred %p1766, %p1765, %p1764; mov.u16 %rs1031, 0; 
@%p1766 bra $L__BB1_1277; mul.f32 %f3806, %f911, %f923; mul.f32 %f3807, %f922, %f913; sub.f32 %f3808, %f3806, %f3807; mul.f32 %f3809, %f928, %f3808; setp.gt.f32 %p1767, %f3809, 0f80000000; setp.ge.f32 %p1768, %f917, 0f00000000; and.pred %p1769, %p1768, %p1767; setp.le.f32 %p1770, %f925, 0f00000000; and.pred %p1771, %p1770, %p1769; mov.u16 %rs1031, 1; @%p1771 bra $L__BB1_1277; mul.f32 %f3810, %f926, %f919; mul.f32 %f3811, %f918, %f927; sub.f32 %f3812, %f3810, %f3811; mul.f32 %f3813, %f928, %f3812; setp.lt.f32 %p1772, %f3813, 0f00000000; sub.f32 %f3814, %f921, %f920; setp.ge.f32 %p1773, %f3814, 0f00000000; and.pred %p1774, %p1773, %p1772; sub.f32 %f3815, %f924, %f925; setp.ge.f32 %p1775, %f3815, 0f00000000; and.pred %p1776, %p1775, %p1774; selp.b16 %rs1031, 2, 3, %p1776; $L__BB1_1277: mul.f32 %f3816, %f909, %f909; fma.rn.f32 %f3817, %f907, %f907, %f3816; add.f32 %f929, %f3817, 0f00000000; mul.f32 %f3818, %f913, %f913; fma.rn.f32 %f3819, %f911, %f911, %f3818; add.f32 %f930, %f3819, 0f00000000; mul.f32 %f3820, %f927, %f927; fma.rn.f32 %f3821, %f926, %f926, %f3820; add.f32 %f931, %f3821, 0f00000000; setp.eq.s16 %p1777, %rs1031, 1; @%p1777 bra $L__BB1_1292; setp.eq.s16 %p1778, %rs1031, 2; @%p1778 bra $L__BB1_1288; setp.ne.s16 %p1779, %rs1031, 3; @%p1779 bra $L__BB1_1296; sub.f32 %f3822, %f916, %f920; div.rn.f32 %f932, %f916, %f3822; sub.f32 %f3823, %f917, %f925; div.rn.f32 %f933, %f917, %f3823; sub.f32 %f3824, %f921, %f920; add.f32 %f3825, %f924, %f3824; sub.f32 %f3826, %f3825, %f925; div.rn.f32 %f5424, %f3824, %f3826; mul.f32 %f3827, %f915, %f915; fma.rn.f32 %f3828, %f914, %f914, %f3827; add.f32 %f3829, %f3828, 0f00000000; mul.f32 %f3830, %f929, %f932; mul.f32 %f3831, %f932, %f3830; sub.f32 %f935, %f3829, %f3831; mul.f32 %f3832, %f930, %f5424; mul.f32 %f3833, %f5424, %f3832; sub.f32 %f936, %f3829, %f3833; mul.f32 %f3834, %f919, %f919; fma.rn.f32 %f3835, %f918, %f918, %f3834; add.f32 %f3836, %f3835, 0f00000000; mul.f32 %f3837, %f931, %f933; mul.f32 %f3838, %f933, %f3837; 
sub.f32 %f937, %f3836, %f3838; setp.lt.f32 %p1780, %f935, %f936; @%p1780 bra $L__BB1_1284; bra.uni $L__BB1_1281; $L__BB1_1284: setp.lt.f32 %p1782, %f935, %f937; @%p1782 bra $L__BB1_1286; bra.uni $L__BB1_1285; $L__BB1_1286: mul.f32 %f5423, %f909, %f932; fma.rn.f32 %f5421, %f907, %f932, %f904; mov.u32 %r5164, 0; mov.f32 %f5422, %f905; mov.f32 %f5424, %f932; bra.uni $L__BB1_1287; $L__BB1_1288: add.u64 %rd12367, %SP, 552; cvta.to.local.u64 %rd12365, %rd12367; add.u64 %rd12373, %SP, 0; cvta.to.local.u64 %rd12371, %rd12373; mul.f32 %f3841, %f927, %f919; fma.rn.f32 %f3842, %f926, %f918, %f3841; div.rn.f32 %f5425, %f3842, %f931; fma.rn.f32 %f3843, %f926, %f5425, %f906; mov.b32 %r3512, %f3843; fma.rn.f32 %f3844, %f927, %f5425, %f5422; mov.b32 %r3513, %f3844; cvt.u64.u32 %rd8739, %r3513; cvt.u64.u32 %rd8740, %r3512; bfi.b64 %rd3052, %rd8739, %rd8740, 32, 32; st.local.u64 [%rd12371], %rd3052; mov.u64 %rd12378, 2; mov.u64 %rd12364, %rd2862; mov.u64 %rd12366, %rd12365; mov.u64 %rd12368, %rd12365; mov.u64 %rd12369, %rd12365; mov.u64 %rd12370, %rd12367; mov.u64 %rd12372, %rd12371; mov.u64 %rd12374, %rd12371; mov.u64 %rd12375, %rd12371; mov.u64 %rd12376, %rd12373; mov.u64 %rd12377, %rd2861; $L__BB1_1289: setp.eq.s64 %p1783, %rd12378, 0; mov.u64 %rd12413, 1; @%p1783 bra $L__BB1_1291; add.s64 %rd12378, %rd12378, -1; add.s64 %rd8745, %rd12365, 8; setp.eq.s64 %p1784, %rd12368, %rd12364; selp.b64 %rd8746, %rd8745, %rd12368, %p1784; add.s64 %rd8747, %rd12366, 8; selp.b64 %rd8748, %rd8747, %rd12369, %p1784; add.s64 %rd8749, %rd12367, 8; selp.b64 %rd8750, %rd8749, %rd12370, %p1784; mov.u64 %rd12413, 0; setp.eq.s64 %p1785, %rd12378, 0; add.s64 %rd8751, %rd8746, 4; add.s64 %rd8752, %rd8748, 4; add.s64 %rd8753, %rd8750, 4; selp.b64 %rd3069, %rd8746, %rd8751, %p1785; selp.b64 %rd12369, %rd8748, %rd8752, %p1785; selp.b64 %rd12370, %rd8750, %rd8753, %p1785; selp.b64 %rd12365, %rd8745, %rd12365, %p1784; selp.b64 %rd12366, %rd8747, %rd12366, %p1784; selp.b64 %rd12367, %rd8749, %rd12367, %p1784; 
add.s64 %rd8754, %rd12368, 8; selp.b64 %rd12364, %rd8754, %rd12364, %p1784; add.s64 %rd8755, %rd12374, 8; setp.eq.s64 %p1786, %rd12371, %rd12377; selp.b64 %rd8756, %rd8755, %rd12371, %p1786; add.s64 %rd8757, %rd12375, 8; selp.b64 %rd8758, %rd8757, %rd12372, %p1786; add.s64 %rd8759, %rd12376, 8; selp.b64 %rd8760, %rd8759, %rd12373, %p1786; selp.b64 %rd12374, %rd8755, %rd12374, %p1786; selp.b64 %rd12375, %rd8757, %rd12375, %p1786; selp.b64 %rd12376, %rd8759, %rd12376, %p1786; add.s64 %rd8761, %rd12371, 8; selp.b64 %rd12377, %rd8761, %rd12377, %p1786; add.s64 %rd8762, %rd8756, 4; add.s64 %rd8763, %rd8758, 4; add.s64 %rd8764, %rd8760, 4; selp.b64 %rd12371, %rd8756, %rd8762, %p1785; selp.b64 %rd12372, %rd8758, %rd8763, %p1785; selp.b64 %rd12373, %rd8760, %rd8764, %p1785; ld.local.f32 %f3845, [%rd8758]; ld.local.f32 %f3846, [%rd8748]; setp.eq.f32 %p1787, %f3846, %f3845; mov.u64 %rd12368, %rd3069; @%p1787 bra $L__BB1_1289; $L__BB1_1291: mov.u64 %rd11179, 0; or.b64 %rd12412, %rd11179, %rd3052; mov.u32 %r5164, 1; bra.uni $L__BB1_1300; $L__BB1_1292: add.u64 %rd12383, %SP, 552; cvta.to.local.u64 %rd12381, %rd12383; add.u64 %rd12389, %SP, 0; cvta.to.local.u64 %rd12387, %rd12389; div.rn.f32 %f5425, %f917, %f930; fma.rn.f32 %f3847, %f911, %f5425, %f904; mov.b32 %r3515, %f3847; fma.rn.f32 %f3848, %f913, %f5425, %f905; mov.b32 %r3516, %f3848; cvt.u64.u32 %rd8766, %r3516; cvt.u64.u32 %rd8767, %r3515; bfi.b64 %rd3093, %rd8766, %rd8767, 32, 32; st.local.u64 [%rd12387], %rd3093; mov.u64 %rd12394, 2; mov.u64 %rd12380, %rd2862; mov.u64 %rd12382, %rd12381; mov.u64 %rd12384, %rd12381; mov.u64 %rd12385, %rd12381; mov.u64 %rd12386, %rd12383; mov.u64 %rd12388, %rd12387; mov.u64 %rd12390, %rd12387; mov.u64 %rd12391, %rd12387; mov.u64 %rd12392, %rd12389; mov.u64 %rd12393, %rd2860; $L__BB1_1293: setp.eq.s64 %p1788, %rd12394, 0; mov.u64 %rd12413, 1; @%p1788 bra $L__BB1_1295; add.s64 %rd12394, %rd12394, -1; add.s64 %rd8772, %rd12381, 8; setp.eq.s64 %p1789, %rd12384, %rd12380; selp.b64 %rd8773, 
%rd8772, %rd12384, %p1789; add.s64 %rd8774, %rd12382, 8; selp.b64 %rd8775, %rd8774, %rd12385, %p1789; add.s64 %rd8776, %rd12383, 8; selp.b64 %rd8777, %rd8776, %rd12386, %p1789; mov.u64 %rd12413, 0; setp.eq.s64 %p1790, %rd12394, 0; add.s64 %rd8778, %rd8773, 4; add.s64 %rd8779, %rd8775, 4; add.s64 %rd8780, %rd8777, 4; selp.b64 %rd3110, %rd8773, %rd8778, %p1790; selp.b64 %rd12385, %rd8775, %rd8779, %p1790; selp.b64 %rd12386, %rd8777, %rd8780, %p1790; selp.b64 %rd12381, %rd8772, %rd12381, %p1789; selp.b64 %rd12382, %rd8774, %rd12382, %p1789; selp.b64 %rd12383, %rd8776, %rd12383, %p1789; add.s64 %rd8781, %rd12384, 8; selp.b64 %rd12380, %rd8781, %rd12380, %p1789; add.s64 %rd8782, %rd12390, 8; setp.eq.s64 %p1791, %rd12387, %rd12393; selp.b64 %rd8783, %rd8782, %rd12387, %p1791; add.s64 %rd8784, %rd12391, 8; selp.b64 %rd8785, %rd8784, %rd12388, %p1791; add.s64 %rd8786, %rd12392, 8; selp.b64 %rd8787, %rd8786, %rd12389, %p1791; selp.b64 %rd12390, %rd8782, %rd12390, %p1791; selp.b64 %rd12391, %rd8784, %rd12391, %p1791; selp.b64 %rd12392, %rd8786, %rd12392, %p1791; add.s64 %rd8788, %rd12387, 8; selp.b64 %rd12393, %rd8788, %rd12393, %p1791; add.s64 %rd8789, %rd8783, 4; add.s64 %rd8790, %rd8785, 4; add.s64 %rd8791, %rd8787, 4; selp.b64 %rd12387, %rd8783, %rd8789, %p1790; selp.b64 %rd12388, %rd8785, %rd8790, %p1790; selp.b64 %rd12389, %rd8787, %rd8791, %p1790; ld.local.f32 %f3849, [%rd8785]; ld.local.f32 %f3850, [%rd8775]; setp.eq.f32 %p1792, %f3850, %f3849; mov.u64 %rd12384, %rd3110; @%p1792 bra $L__BB1_1293; $L__BB1_1295: mov.u64 %rd11180, 0; or.b64 %rd12412, %rd11180, %rd3093; mov.u32 %r5164, 2; bra.uni $L__BB1_1300; $L__BB1_1296: div.rn.f32 %f5425, %f916, %f929; fma.rn.f32 %f3851, %f907, %f5425, %f904; mov.b32 %r3518, %f3851; fma.rn.f32 %f3852, %f909, %f5425, %f905; mov.b32 %r3519, %f3852; cvt.u64.u32 %rd8793, %r3519; cvt.u64.u32 %rd8794, %r3518; bfi.b64 %rd3134, %rd8793, %rd8794, 32, 32; st.local.u64 [%rd8488], %rd3134; mov.u64 %rd12410, 2; mov.u64 %rd12396, %rd2862; mov.u64 
%rd12397, %rd8500; mov.u64 %rd12398, %rd8500; mov.u64 %rd12399, %rd8499; mov.u64 %rd12400, %rd8500; mov.u64 %rd12401, %rd8500; mov.u64 %rd12402, %rd8499; mov.u64 %rd12403, %rd8488; mov.u64 %rd12404, %rd8488; mov.u64 %rd12405, %rd8487; mov.u64 %rd12406, %rd8488; mov.u64 %rd12407, %rd8488; mov.u64 %rd12408, %rd8487; mov.u64 %rd12409, %rd2859; $L__BB1_1297: setp.eq.s64 %p1793, %rd12410, 0; mov.u64 %rd12413, 1; @%p1793 bra $L__BB1_1299; add.s64 %rd12410, %rd12410, -1; add.s64 %rd8799, %rd12397, 8; setp.eq.s64 %p1794, %rd12400, %rd12396; selp.b64 %rd8800, %rd8799, %rd12400, %p1794; add.s64 %rd8801, %rd12398, 8; selp.b64 %rd8802, %rd8801, %rd12401, %p1794; add.s64 %rd8803, %rd12399, 8; selp.b64 %rd8804, %rd8803, %rd12402, %p1794; mov.u64 %rd12413, 0; setp.eq.s64 %p1795, %rd12410, 0; add.s64 %rd8805, %rd8800, 4; add.s64 %rd8806, %rd8802, 4; add.s64 %rd8807, %rd8804, 4; selp.b64 %rd3151, %rd8800, %rd8805, %p1795; selp.b64 %rd12401, %rd8802, %rd8806, %p1795; selp.b64 %rd12402, %rd8804, %rd8807, %p1795; selp.b64 %rd12397, %rd8799, %rd12397, %p1794; selp.b64 %rd12398, %rd8801, %rd12398, %p1794; selp.b64 %rd12399, %rd8803, %rd12399, %p1794; add.s64 %rd8808, %rd12400, 8; selp.b64 %rd12396, %rd8808, %rd12396, %p1794; add.s64 %rd8809, %rd12406, 8; setp.eq.s64 %p1796, %rd12403, %rd12409; selp.b64 %rd8810, %rd8809, %rd12403, %p1796; add.s64 %rd8811, %rd12407, 8; selp.b64 %rd8812, %rd8811, %rd12404, %p1796; add.s64 %rd8813, %rd12408, 8; selp.b64 %rd8814, %rd8813, %rd12405, %p1796; selp.b64 %rd12406, %rd8809, %rd12406, %p1796; selp.b64 %rd12407, %rd8811, %rd12407, %p1796; selp.b64 %rd12408, %rd8813, %rd12408, %p1796; add.s64 %rd8815, %rd12403, 8; selp.b64 %rd12409, %rd8815, %rd12409, %p1796; add.s64 %rd8816, %rd8810, 4; add.s64 %rd8817, %rd8812, 4; add.s64 %rd8818, %rd8814, 4; selp.b64 %rd12403, %rd8810, %rd8816, %p1795; selp.b64 %rd12404, %rd8812, %rd8817, %p1795; selp.b64 %rd12405, %rd8814, %rd8818, %p1795; ld.local.f32 %f3853, [%rd8812]; ld.local.f32 %f3854, [%rd8802]; setp.eq.f32 
%p1797, %f3854, %f3853; mov.u64 %rd12400, %rd3151; @%p1797 bra $L__BB1_1297; $L__BB1_1299: mov.u64 %rd11181, 0; or.b64 %rd12412, %rd11181, %rd3134; mov.u32 %r5164, 0; $L__BB1_1300: mov.f32 %f3855, 0f3F800000; sub.f32 %f3856, %f3855, %f5425; mov.b32 %r3522, %f3856; mov.b32 %r3523, %f5425; cvt.u64.u32 %rd8819, %r3523; cvt.u64.u32 %rd8820, %r3522; bfi.b64 %rd12462, %rd8819, %rd8820, 32, 32; mov.b64 {%r3524, %r3525}, %rd12413; mov.b64 {%r3526, %r3527}, %rd12412; cvt.u32.u64 %r5162, %rd12412; mov.b32 %f5426, %r3527; mov.u32 %r5163, 1; mov.b32 {%rs1032, %rs664}, %r3524; bra.uni $L__BB1_1313; $L__BB1_1281: setp.lt.f32 %p1781, %f936, %f937; @%p1781 bra $L__BB1_1283; bra.uni $L__BB1_1282; $L__BB1_1283: mul.f32 %f5423, %f913, %f933; fma.rn.f32 %f5421, %f911, %f933, %f904; mov.u32 %r5164, 2; mov.f32 %f5422, %f905; mov.f32 %f5424, %f933; bra.uni $L__BB1_1287; $L__BB1_1285: mul.f32 %f5423, %f927, %f5424; fma.rn.f32 %f5421, %f926, %f5424, %f906; mov.u32 %r5164, 1; bra.uni $L__BB1_1287; $L__BB1_1282: mul.f32 %f5423, %f927, %f5424; fma.rn.f32 %f5421, %f926, %f5424, %f906; mov.u32 %r5164, 1; $L__BB1_1287: add.f32 %f5426, %f5422, %f5423; mov.f32 %f3839, 0f3F800000; sub.f32 %f3840, %f3839, %f5424; mov.b32 %r3510, %f3840; mov.b32 %r3511, %f5424; cvt.u64.u32 %rd8736, %r3511; cvt.u64.u32 %rd8737, %r3510; bfi.b64 %rd12462, %rd8736, %rd8737, 32, 32; mov.b32 %r5162, %f5421; mov.u32 %r5163, 1; mov.u16 %rs1032, 1; $L__BB1_1313: mov.b32 %f3863, %r5162; sub.f32 %f3864, %f3863, %f901; sub.f32 %f3865, %f5426, %f902; mul.f32 %f3866, %f3865, %f3865; fma.rn.f32 %f3867, %f3864, %f3864, %f3866; add.f32 %f3868, %f3867, 0f00000000; sqrt.rn.f32 %f3869, %f3868; shl.b64 %rd8904, %rd3033, 2; add.s64 %rd8905, %rd24, %rd8904; st.local.f32 [%rd8905+-4], %f3869; mul.lo.s64 %rd8906, %rd3033, 36; add.s64 %rd8907, %rd7, %rd8906; st.local.u32 [%rd8907+-36], %r5162; st.local.f32 [%rd8907+-32], %f5426; mov.u16 %rs677, 0; st.local.v4.u8 [%rd8907+-28], {%rs1032, %rs677, %rs677, %rs677}; st.local.u32 [%rd8907+-24], 
%r941; st.local.u32 [%rd8907+-20], %r5163; st.local.u32 [%rd8907+-16], %r5164; shr.u64 %rd8908, %rd12462, 32; st.local.u32 [%rd8907+-8], %rd8908; st.local.u32 [%rd8907+-12], %rd12462; $L__BB1_1314: setp.lt.u64 %p1813, %rd3033, 4; add.s64 %rd3033, %rd3033, 1; @%p1813 bra $L__BB1_1261; ld.local.v2.u64 {%rd12463, %rd12464}, [%rd24]; ld.local.v4.u32 {%r5174, %r5175, %r5176, %r3555}, [%rd7]; ld.local.u32 %r5177, [%rd7+16]; ld.local.u32 %rd8911, [%rd2873+4]; ld.local.u32 %rd8912, [%rd2873+8]; bfi.b64 %rd8913, %rd8912, %rd8911, 32, 32; mov.b64 {%r5171, %r5172}, %rd8913; ld.local.u32 %r5173, [%rd2873+12]; ld.local.u32 %r5178, [%rd2874+4]; ld.local.u32 %r5170, [%rd2875+16]; ld.local.u64 %rd8914, [%rd2875+8]; mov.b64 {%r5168, %r5169}, %rd8914; ld.local.u32 %r5179, [%rd2876+8]; ld.local.u32 %rd8915, [%rd2877+12]; ld.local.u32 %rd8916, [%rd2877+16]; bfi.b64 %rd8917, %rd8916, %rd8915, 32, 32; mov.b64 {%r5165, %r5166}, %rd8917; ld.local.u32 %r5167, [%rd2877+20]; ld.local.u32 %r5180, [%rd2878+12]; bra.uni $L__BB1_1316; $L__BB1_1259: mov.u32 %r5177, 4; mov.u32 %r5178, %r5177; mov.u32 %r5179, %r5177; mov.u32 %r5180, %r5177; $L__BB1_1316: and.b64 %rd8918, %rd3029, 1; setp.eq.b64 %p1814, %rd8918, 1; mov.pred %p1815, 0; xor.pred %p1816, %p1814, %p1815; not.pred %p1817, %p1816; mov.b64 {%r984, %r985}, %rd12463; mov.b32 %f960, %r984; mov.b32 %f961, %r985; mov.b64 {%r986, %r987}, %rd12464; mov.b32 %f962, %r986; mov.b32 %f963, %r987; @%p1817 bra $L__BB1_1325; bra.uni $L__BB1_1317; $L__BB1_1325: and.b64 %rd8936, %rd3029, 2; setp.eq.s64 %p1831, %rd8936, 0; @%p1831 bra $L__BB1_1334; bra.uni $L__BB1_1326; $L__BB1_1334: and.b64 %rd8954, %rd3029, 4; setp.eq.s64 %p1845, %rd8954, 0; @%p1845 bra $L__BB1_1343; bra.uni $L__BB1_1335; $L__BB1_1343: and.b64 %rd8972, %rd3029, 8; setp.eq.s64 %p1859, %rd8972, 0; @%p1859 bra $L__BB1_1251; ld.u8 %rs684, [%rd3018+88]; and.b16 %rs685, %rs684, 1; setp.eq.b16 %p1860, %rs685, 1; mov.pred %p1861, 0; xor.pred %p1862, %p1860, %p1861; not.pred %p1863, %p1862; 
@%p1863 bra $L__BB1_1347; bra.uni $L__BB1_1345; $L__BB1_1347: ld.u32 %r1035, [%rd3018+76]; cvt.u64.u32 %rd8976, %r1035; setp.le.u64 %p1870, %rd3006, %rd8976; @%p1870 bra $L__BB1_1251; neg.f32 %f967, %f963; setp.lt.u32 %p1871, %r940, 64; @%p1871 bra $L__BB1_1350; bra.uni $L__BB1_1349; $L__BB1_1350: mul.wide.u32 %rd8988, %r940, 8; add.s64 %rd8989, %rd3007, %rd8988; mov.u64 %rd12471, 0; st.local.u32 [%rd8989], %r1035; st.local.f32 [%rd8989+4], %f967; add.s32 %r940, %r940, 1; st.local.u32 [%rd3007+512], %r940; mov.u64 %rd12472, %rd12471; bra.uni $L__BB1_1351; $L__BB1_1317: ld.u8 %rs678, [%rd3018+88]; and.b16 %rs679, %rs678, 1; setp.eq.b16 %p1818, %rs679, 1; xor.pred %p1820, %p1818, %p1815; not.pred %p1821, %p1820; @%p1821 bra $L__BB1_1320; bra.uni $L__BB1_1318; $L__BB1_1320: ld.u32 %r993, [%rd3018+64]; cvt.u64.u32 %rd8922, %r993; setp.le.u64 %p1828, %rd3006, %rd8922; @%p1828 bra $L__BB1_1325; neg.f32 %f964, %f960; setp.lt.u32 %p1829, %r940, 64; @%p1829 bra $L__BB1_1323; bra.uni $L__BB1_1322; $L__BB1_1323: add.s32 %r3558, %r939, -1; mul.wide.u32 %rd8934, %r3558, 8; add.s64 %rd8935, %rd3007, %rd8934; mov.u64 %rd12465, 0; st.local.u32 [%rd8935], %r993; st.local.f32 [%rd8935+4], %f964; add.s32 %r940, %r940, 1; st.local.u32 [%rd3007+512], %r940; mov.u64 %rd12466, %rd12465; bra.uni $L__BB1_1324; $L__BB1_1326: ld.u8 %rs680, [%rd3018+88]; and.b16 %rs681, %rs680, 1; setp.eq.b16 %p1832, %rs681, 1; mov.pred %p1833, 0; xor.pred %p1834, %p1832, %p1833; not.pred %p1835, %p1834; @%p1835 bra $L__BB1_1329; bra.uni $L__BB1_1327; $L__BB1_1329: ld.u32 %r1007, [%rd3018+68]; cvt.u64.u32 %rd8940, %r1007; setp.le.u64 %p1842, %rd3006, %rd8940; @%p1842 bra $L__BB1_1334; neg.f32 %f965, %f961; setp.lt.u32 %p1843, %r940, 64; @%p1843 bra $L__BB1_1332; bra.uni $L__BB1_1331; $L__BB1_1332: mul.wide.u32 %rd8952, %r940, 8; add.s64 %rd8953, %rd3007, %rd8952; mov.u64 %rd12467, 0; st.local.u32 [%rd8953], %r1007; st.local.f32 [%rd8953+4], %f965; add.s32 %r940, %r940, 1; st.local.u32 [%rd3007+512], %r940; 
mov.u64 %rd12468, %rd12467; bra.uni $L__BB1_1333; $L__BB1_1335: ld.u8 %rs682, [%rd3018+88]; and.b16 %rs683, %rs682, 1; setp.eq.b16 %p1846, %rs683, 1; mov.pred %p1847, 0; xor.pred %p1848, %p1846, %p1847; not.pred %p1849, %p1848; @%p1849 bra $L__BB1_1338; bra.uni $L__BB1_1336; $L__BB1_1338: ld.u32 %r1021, [%rd3018+72]; cvt.u64.u32 %rd8958, %r1021; setp.le.u64 %p1856, %rd3006, %rd8958; @%p1856 bra $L__BB1_1343; neg.f32 %f966, %f962; setp.lt.u32 %p1857, %r940, 64; @%p1857 bra $L__BB1_1341; bra.uni $L__BB1_1340; $L__BB1_1341: mul.wide.u32 %rd8970, %r940, 8; add.s64 %rd8971, %rd3007, %rd8970; mov.u64 %rd12469, 0; st.local.u32 [%rd8971], %r1021; st.local.f32 [%rd8971+4], %f966; add.s32 %r940, %r940, 1; st.local.u32 [%rd3007+512], %r940; mov.u64 %rd12470, %rd12469; bra.uni $L__BB1_1342; $L__BB1_1345: mov.b32 %f3872, %r938; setp.leu.f32 %p1864, %f3872, %f963; setp.eq.s32 %p1865, %r5180, 4; or.pred %p1866, %p1865, %p1864; @%p1866 bra $L__BB1_1251; bra.uni $L__BB1_1346; $L__BB1_1318: setp.leu.f32 %p1822, %f903, %f960; setp.eq.s32 %p1823, %r5177, 4; or.pred %p1824, %p1823, %p1822; @%p1824 bra $L__BB1_1325; ld.u32 %r3556, [%rd3018+64]; cvt.u64.u32 %rd8919, %r3556; setp.le.u64 %p1825, %rd3009, %rd8919; mul.wide.u32 %rd8920, %r3556, 12; add.s64 %rd8921, %rd3010, %rd8920; setp.eq.s64 %p1826, %rd8921, 0; or.pred %p1827, %p1825, %p1826; selp.b32 %r935, %r935, %r5176, %p1827; selp.b32 %r934, %r934, %r5175, %p1827; selp.b32 %r933, %r933, %r5174, %p1827; selp.b32 %r937, %r937, %r5177, %p1827; selp.b32 %r938, %r938, %r984, %p1827; bra.uni $L__BB1_1325; $L__BB1_1327: mov.b32 %f3870, %r938; setp.leu.f32 %p1836, %f3870, %f961; setp.eq.s32 %p1837, %r5178, 4; or.pred %p1838, %p1837, %p1836; @%p1838 bra $L__BB1_1334; ld.u32 %r3564, [%rd3018+68]; cvt.u64.u32 %rd8937, %r3564; setp.le.u64 %p1839, %rd3009, %rd8937; mul.wide.u32 %rd8938, %r3564, 12; add.s64 %rd8939, %rd3010, %rd8938; setp.eq.s64 %p1840, %rd8939, 0; or.pred %p1841, %p1839, %p1840; selp.b32 %r935, %r935, %r5173, %p1841; selp.b32 
%r934, %r934, %r5172, %p1841; selp.b32 %r933, %r933, %r5171, %p1841; selp.b32 %r937, %r937, %r5178, %p1841; selp.b32 %r938, %r938, %r985, %p1841; bra.uni $L__BB1_1334; $L__BB1_1336: mov.b32 %f3871, %r938; setp.leu.f32 %p1850, %f3871, %f962; setp.eq.s32 %p1851, %r5179, 4; or.pred %p1852, %p1851, %p1850; @%p1852 bra $L__BB1_1343; ld.u32 %r3571, [%rd3018+72]; cvt.u64.u32 %rd8955, %r3571; setp.le.u64 %p1853, %rd3009, %rd8955; mul.wide.u32 %rd8956, %r3571, 12; add.s64 %rd8957, %rd3010, %rd8956; setp.eq.s64 %p1854, %rd8957, 0; or.pred %p1855, %p1853, %p1854; selp.b32 %r935, %r935, %r5170, %p1855; selp.b32 %r934, %r934, %r5169, %p1855; selp.b32 %r933, %r933, %r5168, %p1855; selp.b32 %r937, %r937, %r5179, %p1855; selp.b32 %r938, %r938, %r986, %p1855; bra.uni $L__BB1_1343; $L__BB1_1349: mov.u64 %rd12472, 1; shl.b64 %rd12471, %rd8976, 32; $L__BB1_1351: mov.u64 %rd11194, 0; cvt.u32.u64 %r3580, %rd11194; cvt.u32.u64 %r3581, %rd12471; or.b32 %r3582, %r3581, %r3580; cvt.u32.u64 %r3583, %rd12472; or.b32 %r3584, %r3582, %r3583; setp.eq.s32 %p1872, %r3584, 0; @%p1872 bra $L__BB1_1251; bra.uni $L__BB1_1352; $L__BB1_1322: mov.u64 %rd12466, 1; shl.b64 %rd12465, %rd8922, 32; $L__BB1_1324: mov.u64 %rd11185, 0; cvt.u32.u64 %r3559, %rd11185; cvt.u32.u64 %r3560, %rd12465; or.b32 %r3561, %r3560, %r3559; cvt.u32.u64 %r3562, %rd12466; or.b32 %r3563, %r3561, %r3562; setp.ne.s32 %p1830, %r3563, 0; @%p1830 bra $L__BB1_1352; bra.uni $L__BB1_1325; $L__BB1_1331: mov.u64 %rd12468, 1; shl.b64 %rd12467, %rd8940, 32; $L__BB1_1333: mov.u64 %rd11188, 0; cvt.u32.u64 %r3566, %rd11188; cvt.u32.u64 %r3567, %rd12467; or.b32 %r3568, %r3567, %r3566; cvt.u32.u64 %r3569, %rd12468; or.b32 %r3570, %r3568, %r3569; setp.ne.s32 %p1844, %r3570, 0; @%p1844 bra $L__BB1_1352; bra.uni $L__BB1_1334; $L__BB1_1340: mov.u64 %rd12470, 1; shl.b64 %rd12469, %rd8958, 32; $L__BB1_1342: mov.u64 %rd11191, 0; cvt.u32.u64 %r3573, %rd11191; cvt.u32.u64 %r3574, %rd12469; or.b32 %r3575, %r3574, %r3573; cvt.u32.u64 %r3576, %rd12470; or.b32 
%r3577, %r3575, %r3576; setp.ne.s32 %p1858, %r3577, 0; @%p1858 bra $L__BB1_1352; bra.uni $L__BB1_1343; $L__BB1_1353: setp.eq.s32 %p1873, %r937, 4; mov.u64 %rd12473, %rd8644; mov.u64 %rd12474, %rd8642; mov.u64 %rd12475, %rd8644; @%p1873 bra $L__BB1_1355; mov.b64 %rd12475, {%r933, %r934}; mov.b32 {%rs686, %rs687}, %r935; mov.b64 %rd8996, {%r935, %r3585}; and.b64 %rd12473, %rd8996, 4294967040; cvt.u64.u16 %rd8997, %rs686; and.b64 %rd12474, %rd8997, 255; $L__BB1_1355: or.b64 %rd9004, %rd12474, %rd12473; or.b64 %rd9005, %rd9004, %rd8644; mov.b64 {%r3586, %r3587}, %rd9005; mov.b32 {%rs132, %rs688}, %r3586; and.b16 %rs689, %rs132, 255; setp.eq.s16 %p1874, %rs689, 2; @%p1874 bra $L__BB1_1357; cvt.u32.u64 %r3588, %rd12475; mov.b32 %f3873, %r3588; shr.u64 %rd9006, %rd12475, 32; cvt.u32.u64 %r3589, %rd9006; mov.b32 %f3874, %r3589; ld.global.f32 %f3875, [%rd2884+-24]; mul.f32 %f3876, %f3875, %f3873; ld.global.f32 %f3877, [%rd2884+-20]; mul.f32 %f3878, %f3877, %f3874; sub.f32 %f3879, %f3876, %f3878; mul.f32 %f3880, %f3877, %f3873; fma.rn.f32 %f3881, %f3875, %f3874, %f3880; ld.global.f32 %f3882, [%rd2884+-16]; add.f32 %f3883, %f3882, %f3879; mov.b32 %r3590, %f3883; ld.global.f32 %f3884, [%rd2884+-12]; add.f32 %f3885, %f3884, %f3881; mov.b32 %r3591, %f3885; cvt.u64.u32 %rd9007, %r3591; cvt.u64.u32 %rd9008, %r3590; cvt.u64.u16 %rd9009, %rs132; bfi.b64 %rd8644, %rd9007, %rd9008, 32, 32; and.b64 %rd9010, %rd9009, 255; mov.b64 {%r3592, %r3593}, %rd9010; mov.b32 {%rs690, %rs691}, %r3592; cvt.u64.u16 %rd8642, %rs690; $L__BB1_1357: mov.u64 %rd11203, 0; or.b64 %rd9017, %rd11203, %rd8642; or.b64 %rd3357, %rd9017, %rd11203; mov.b64 {%r3594, %r3595}, %rd3357; mov.b32 {%rs133, %rs692}, %r3594; and.b16 %rs693, %rs133, 255; setp.eq.s16 %p1875, %rs693, 2; mov.u64 %rd12478, 2; mov.u64 %rd12479, %rd11203; mov.u64 %rd12480, %rd11203; @%p1875 bra $L__BB1_1359; and.b64 %rd9019, %rd3357, 4294967040; cvt.u64.u16 %rd9020, %rs133; and.b64 %rd9021, %rd9020, 255; or.b64 %rd9022, %rd9021, %rd11203; or.b64 
%rd9023, %rd9022, %rd9019; mov.b64 {%r3596, %r3597}, %rd9023; mov.b32 {%rs694, %rs695}, %r3596; not.b16 %rs696, %rs694; ld.global.u8 %rs697, [%rd2884+-32]; setp.eq.s16 %p1876, %rs697, 0; and.b16 %rs698, %rs696, 1; selp.b16 %rs699, %rs694, %rs698, %p1876; and.b64 %rd9024, %rd9023, 4294967040; cvt.u64.u16 %rd9025, %rs699; and.b64 %rd9026, %rd9025, 255; or.b64 %rd9027, %rd9024, %rd11203; or.b64 %rd9028, %rd9027, %rd9026; mov.b64 {%r3598, %r3599}, %rd9028; mov.b32 {%rs700, %rs701}, %r3598; and.b64 %rd12480, %rd9028, 4294967040; cvt.u64.u16 %rd9029, %rs700; and.b64 %rd12478, %rd9029, 255; mov.u64 %rd12479, %rd8644; $L__BB1_1359: or.b64 %rd9030, %rd12479, %rd11203; or.b64 %rd9031, %rd11203, %rd12478; or.b64 %rd9032, %rd9031, %rd12480; or.b64 %rd9033, %rd9030, %rd11203; mov.b64 {%r5211, %r5212}, %rd9033; mov.b64 {%r5213, %r3600}, %rd9032; bra.uni $L__BB1_1416; $L__BB1_1219: cvt.u32.u64 %r3393, %rd2886; cvt.u32.u64 %r3394, %rd2905; rem.u32 %r3395, %r3394, %r3393; cvt.u64.u32 %rd12320, %r3395; $L__BB1_1220: shl.b64 %rd8547, %rd12320, 3; add.s64 %rd2909, %rd2887, %rd8547; ld.u32 %rd8548, [%rd2909]; ld.u32 %rd8549, [%rd2909+4]; bfi.b64 %rd2910, %rd8549, %rd8548, 32, 32; add.s64 %rd2911, %rd12320, 1; or.b64 %rd8550, %rd2911, %rd2886; and.b64 %rd8551, %rd8550, -4294967296; setp.eq.s64 %p1693, %rd8551, 0; @%p1693 bra $L__BB1_1222; rem.u64 %rd12321, %rd2911, %rd2886; bra.uni $L__BB1_1223; $L__BB1_1222: cvt.u32.u64 %r3396, %rd2886; cvt.u32.u64 %r3397, %rd2911; rem.u32 %r3398, %r3397, %r3396; cvt.u64.u32 %rd12321, %r3398; $L__BB1_1223: add.u64 %rd12331, %SP, 560; cvta.to.local.u64 %rd12329, %rd12331; shl.b64 %rd8553, %rd12321, 3; add.s64 %rd2921, %rd2887, %rd8553; ld.u32 %rd8554, [%rd2921]; ld.u32 %rd8555, [%rd2921+4]; bfi.b64 %rd8556, %rd8555, %rd8554, 32, 32; st.local.v2.u64 [%rd12329], {%rd2910, %rd8556}; mov.u64 %rd12336, 2; mov.u64 %rd12322, %rd2868; mov.u64 %rd12323, %rd2866; mov.u64 %rd12324, %rd2866; mov.u64 %rd12325, %rd2867; mov.u64 %rd12326, %rd2866; mov.u64 %rd12327, 
%rd2866; mov.u64 %rd12328, %rd2867; mov.u64 %rd12330, %rd12329; mov.u64 %rd12332, %rd12329; mov.u64 %rd12333, %rd12329; mov.u64 %rd12334, %rd12331; mov.u64 %rd12335, %rd2869; $L__BB1_1224: setp.eq.s64 %p1694, %rd12336, 0; @%p1694 bra $L__BB1_1227; add.s64 %rd12336, %rd12336, -1; add.s64 %rd8557, %rd12323, 8; setp.eq.s64 %p1695, %rd12326, %rd12322; selp.b64 %rd8558, %rd8557, %rd12326, %p1695; add.s64 %rd8559, %rd12324, 8; selp.b64 %rd8560, %rd8559, %rd12327, %p1695; add.s64 %rd8561, %rd12325, 8; selp.b64 %rd8562, %rd8561, %rd12328, %p1695; setp.eq.s64 %p1696, %rd12336, 0; add.s64 %rd8563, %rd8558, 4; add.s64 %rd8564, %rd8560, 4; add.s64 %rd8565, %rd8562, 4; selp.b64 %rd2938, %rd8558, %rd8563, %p1696; selp.b64 %rd12327, %rd8560, %rd8564, %p1696; selp.b64 %rd12328, %rd8562, %rd8565, %p1696; selp.b64 %rd12323, %rd8557, %rd12323, %p1695; selp.b64 %rd12324, %rd8559, %rd12324, %p1695; selp.b64 %rd12325, %rd8561, %rd12325, %p1695; add.s64 %rd8566, %rd12326, 8; selp.b64 %rd12322, %rd8566, %rd12322, %p1695; add.s64 %rd8567, %rd12332, 8; setp.eq.s64 %p1697, %rd12329, %rd12335; selp.b64 %rd8568, %rd8567, %rd12329, %p1697; add.s64 %rd8569, %rd12333, 8; selp.b64 %rd8570, %rd8569, %rd12330, %p1697; add.s64 %rd8571, %rd12334, 8; selp.b64 %rd8572, %rd8571, %rd12331, %p1697; selp.b64 %rd12332, %rd8567, %rd12332, %p1697; selp.b64 %rd12333, %rd8569, %rd12333, %p1697; selp.b64 %rd12334, %rd8571, %rd12334, %p1697; add.s64 %rd8573, %rd12329, 8; selp.b64 %rd12335, %rd8573, %rd12335, %p1697; add.s64 %rd8574, %rd8568, 4; add.s64 %rd8575, %rd8570, 4; add.s64 %rd8576, %rd8572, 4; selp.b64 %rd12329, %rd8568, %rd8574, %p1696; selp.b64 %rd12330, %rd8570, %rd8575, %p1696; selp.b64 %rd12331, %rd8572, %rd8576, %p1696; ld.local.f32 %f3639, [%rd8570]; ld.local.f32 %f3640, [%rd8560]; setp.eq.f32 %p1698, %f3640, %f3639; mov.u64 %rd12326, %rd2938; @%p1698 bra $L__BB1_1224; bra.uni $L__BB1_1226; $L__BB1_1227: ld.u32 %rd8577, [%rd2909]; ld.u32 %rd8578, [%rd2909+4]; bfi.b64 %rd8579, %rd8578, %rd8577, 32, 
32; cvt.u32.u64 %r3399, %rd8579; mov.b32 %f3641, %r3399; shr.u64 %rd8580, %rd8579, 32; cvt.u32.u64 %r3400, %rd8580; mov.b32 %f3642, %r3400; ld.u32 %rd8581, [%rd2921]; ld.u32 %rd8582, [%rd2921+4]; bfi.b64 %rd8583, %rd8582, %rd8581, 32, 32; cvt.u32.u64 %r3401, %rd8583; shr.u64 %rd8584, %rd8583, 32; cvt.u32.u64 %r3402, %rd8584; mov.b32 %f3643, %r3401; sub.f32 %f5419, %f3643, %f3641; mov.b32 %f3644, %r3402; sub.f32 %f5420, %f3644, %f3642; bra.uni $L__BB1_1238; $L__BB1_1232: cvt.u32.u64 %r3403, %rd2886; cvt.u32.u64 %r3404, %rd2952; rem.u32 %r3405, %r3404, %r3403; cvt.u64.u32 %rd12337, %r3405; $L__BB1_1233: shl.b64 %rd8593, %rd12337, 3; add.s64 %rd8594, %rd2887, %rd8593; ld.u32 %rd8595, [%rd8594]; ld.u32 %rd8596, [%rd8594+4]; bfi.b64 %rd2963, %rd8596, %rd8595, 32, 32; add.u64 %rd8598, %SPL, 560; st.local.v2.u64 [%rd8598], {%rd2953, %rd2963}; mov.u64 %rd12352, 2; mov.u64 %rd12338, %rd2866; mov.u64 %rd12339, %rd2863; mov.u64 %rd12340, %rd2863; mov.u64 %rd12341, %rd2865; mov.u64 %rd12342, %rd2863; mov.u64 %rd12343, %rd2863; mov.u64 %rd12344, %rd2865; mov.u64 %rd12345, %rd2870; mov.u64 %rd12346, %rd2870; mov.u64 %rd12347, %rd2871; mov.u64 %rd12348, %rd2870; mov.u64 %rd12349, %rd2870; mov.u64 %rd12350, %rd2871; mov.u64 %rd12351, %rd2872; $L__BB1_1234: setp.eq.s64 %p1702, %rd12352, 0; @%p1702 bra $L__BB1_1237; add.s64 %rd12352, %rd12352, -1; add.s64 %rd8599, %rd12339, 8; setp.eq.s64 %p1703, %rd12342, %rd12338; selp.b64 %rd8600, %rd8599, %rd12342, %p1703; add.s64 %rd8601, %rd12340, 8; selp.b64 %rd8602, %rd8601, %rd12343, %p1703; add.s64 %rd8603, %rd12341, 8; selp.b64 %rd8604, %rd8603, %rd12344, %p1703; setp.eq.s64 %p1704, %rd12352, 0; add.s64 %rd8605, %rd8600, 4; add.s64 %rd8606, %rd8602, 4; add.s64 %rd8607, %rd8604, 4; selp.b64 %rd2980, %rd8600, %rd8605, %p1704; selp.b64 %rd12343, %rd8602, %rd8606, %p1704; selp.b64 %rd12344, %rd8604, %rd8607, %p1704; selp.b64 %rd12339, %rd8599, %rd12339, %p1703; selp.b64 %rd12340, %rd8601, %rd12340, %p1703; selp.b64 %rd12341, %rd8603, 
%rd12341, %p1703; add.s64 %rd8608, %rd12342, 8; selp.b64 %rd12338, %rd8608, %rd12338, %p1703; add.s64 %rd8609, %rd12348, 8; setp.eq.s64 %p1705, %rd12345, %rd12351; selp.b64 %rd8610, %rd8609, %rd12345, %p1705; add.s64 %rd8611, %rd12349, 8; selp.b64 %rd8612, %rd8611, %rd12346, %p1705; add.s64 %rd8613, %rd12350, 8; selp.b64 %rd8614, %rd8613, %rd12347, %p1705; selp.b64 %rd12348, %rd8609, %rd12348, %p1705; selp.b64 %rd12349, %rd8611, %rd12349, %p1705; selp.b64 %rd12350, %rd8613, %rd12350, %p1705; add.s64 %rd8615, %rd12345, 8; selp.b64 %rd12351, %rd8615, %rd12351, %p1705; add.s64 %rd8616, %rd8610, 4; add.s64 %rd8617, %rd8612, 4; add.s64 %rd8618, %rd8614, 4; selp.b64 %rd12345, %rd8610, %rd8616, %p1704; selp.b64 %rd12346, %rd8612, %rd8617, %p1704; selp.b64 %rd12347, %rd8614, %rd8618, %p1704; ld.local.f32 %f3645, [%rd8612]; ld.local.f32 %f3646, [%rd8602]; setp.eq.f32 %p1706, %f3646, %f3645; mov.u64 %rd12342, %rd2980; @%p1706 bra $L__BB1_1234; bra.uni $L__BB1_1236; $L__BB1_1237: cvt.u32.u64 %r3406, %rd2953; mov.b32 %f3647, %r3406; shr.u64 %rd8619, %rd2953, 32; cvt.u32.u64 %r3407, %rd8619; mov.b32 %f3648, %r3407; shr.u64 %rd8620, %rd2963, 32; cvt.u32.u64 %r3408, %rd8620; cvt.u32.u64 %r3409, %rd2963; mov.b32 %f3649, %r3409; sub.f32 %f3650, %f3649, %f3647; mov.b32 %f3651, %r3408; sub.f32 %f3652, %f3651, %f3648; neg.f32 %f5419, %f3650; neg.f32 %f5420, %f3652; $L__BB1_1238: mul.f32 %f3653, %f893, %f5420; fma.rn.f32 %f900, %f892, %f5419, %f3653; mul.f32 %f3654, %f5420, %f5420; fma.rn.f32 %f3655, %f5419, %f5419, %f3654; add.f32 %f3656, %f3655, 0f00000000; sqrt.rn.f32 %f3657, %f3656; mul.f32 %f3658, %f3657, 0f3A83126F; abs.f32 %f3659, %f900; setp.gt.f32 %p1707, %f3659, %f3658; @%p1707 bra $L__BB1_1240; bra.uni $L__BB1_1239; $L__BB1_1240: setp.ge.f32 %p2938, %f900, 0f00000000; bra.uni $L__BB1_1243; $L__BB1_1239: ld.local.u64 %rd8621, [%rd2885+8]; cvt.u32.u64 %r3410, %rd8621; mov.b32 %f3660, %r3410; shr.u64 %rd8622, %rd8621, 32; cvt.u32.u64 %r3411, %rd8622; mov.b32 %f3661, %r3411; 
sub.f32 %f3662, %f859, %f3660; sub.f32 %f3663, %f860, %f3661; mul.f32 %f3664, %f893, %f3663; fma.rn.f32 %f3665, %f892, %f3662, %f3664; setp.le.f32 %p2938, %f3665, 0f00000000; $L__BB1_1243: selp.u16 %rs637, 1, 0, %p2938; st.local.u8 [%rd2885+16], %rs637; $L__BB1_1244: ld.local.v2.u32 {%r5149, %r5150}, [%rd2885+8]; ld.local.u32 %r5151, [%rd2885+16]; $L__BB1_1246: setp.eq.s32 %p1708, %r918, 2; mov.u64 %rd8630, 0; mov.u64 %rd12353, 2; mov.u64 %rd12354, %rd8630; @%p1708 bra $L__BB1_1248; setp.ne.s16 %p1709, %rs117, 0; cvt.u16.u32 %rs639, %r5151; selp.u16 %rs640, 1, 0, %p1709; xor.b16 %rs641, %rs639, %rs640; mov.b32 %f3672, %r5149; mov.b32 %f3673, %r5150; mul.f32 %f3674, %f863, %f3672; ld.global.f32 %f3675, [%rd2884+-20]; mul.f32 %f3676, %f3675, %f3673; sub.f32 %f3677, %f3674, %f3676; mul.f32 %f3678, %f3675, %f3672; fma.rn.f32 %f3679, %f863, %f3673, %f3678; add.f32 %f3680, %f861, %f3677; mov.b32 %r3416, %f3680; add.f32 %f3681, %f862, %f3679; mov.b32 %r3417, %f3681; cvt.u64.u32 %rd8631, %r3417; cvt.u64.u32 %rd8632, %r3416; cvt.u64.u16 %rd8633, %rs641; bfi.b64 %rd12354, %rd8631, %rd8632, 32, 32; and.b64 %rd8634, %rd8633, 255; mov.b64 {%r3418, %r3419}, %rd8634; mov.b32 {%rs642, %rs643}, %r3418; cvt.u64.u16 %rd12353, %rs642; $L__BB1_1248: or.b64 %rd8635, %rd8630, %rd8630; or.b64 %rd8636, %rd12353, %rd8630; or.b64 %rd8637, %rd8636, %rd8630; or.b64 %rd8638, %rd8635, %rd12354; mov.b64 {%r5211, %r5212}, %rd8638; mov.b64 {%r5213, %r3420}, %rd8637; $L__BB1_1416: mov.b32 {%rs138, %rs711}, %r5213; and.b16 %rs712, %rs138, 255; setp.eq.s16 %p1954, %rs712, 2; @%p1954 bra $L__BB1_1418; mov.b64 %rd9115, {%r5213, %r3671}; shr.u64 %rd9116, %rd9115, 8; and.b64 %rd9117, %rd9116, 16777215; cvt.u64.u16 %rd9118, %rs138; and.b64 %rd9119, %rd9118, 255; mov.b64 %rd8507, {%r5211, %r5212}; bfi.b64 %rd3413, %rd9117, %rd9119, 8, 56; mov.b64 {%r3341, %r3672}, %rd3413; $L__BB1_1418: mov.b32 {%rs713, %rs714}, %r3341; and.b16 %rs715, %rs713, 255; setp.eq.s16 %p1955, %rs715, 2; cvt.u64.u16 %rd9120, %rs713; 
and.b64 %rd9121, %rd9120, 255; selp.b64 %rd9122, 2, %rd9121, %p1955; mov.b64 %rd9123, {%r3341, %r3673}; and.b64 %rd9124, %rd9123, 4294967040; or.b64 %rd3419, %rd9124, %rd9122; mov.b64 {%r3674, %r3675}, %rd3419; mov.b32 {%rs139, %rs716}, %r3674; and.b16 %rs717, %rs139, 255; setp.eq.s16 %p1956, %rs717, 2; @%p1956 bra $L__BB1_1420; bra.uni $L__BB1_1419; $L__BB1_1420: setp.ne.s64 %p1957, %rd2883, 0; add.s64 %rd12312, %rd2881, 280; add.s64 %rd12313, %rd2882, 280; @%p1957 bra $L__BB1_1187; $L__BB1_1421: add.s64 %rd3469, %rd2881, 280; add.s64 %rd3471, %rd2882, 280; mov.u64 %rd8507, %rd8482; bra.uni $L__BB1_1422; $L__BB1_1419: add.s64 %rd3469, %rd2881, 280; add.s64 %rd3471, %rd2882, 280; shl.b64 %rd9125, %rd3419, 16; shr.u64 %rd9126, %rd9125, 24; cvt.u64.u16 %rd9127, %rs139; and.b64 %rd9128, %rd9127, 255; bfi.b64 %rd9129, %rd9126, %rd9128, 8, 56; mov.b64 {%r3338, %r3676}, %rd9129; $L__BB1_1422: mov.b32 {%rs718, %rs719}, %r3338; and.b16 %rs720, %rs718, 255; setp.eq.s16 %p1958, %rs720, 2; cvt.u64.u16 %rd9132, %rs718; and.b64 %rd9133, %rd9132, 255; selp.b64 %rd9134, 2, %rd9133, %p1958; mov.b64 %rd9135, {%r3338, %r3681}; and.b64 %rd9136, %rd9135, 4294967040; or.b64 %rd9137, %rd9136, %rd8482; or.b64 %rd3433, %rd9137, %rd9134; mov.b64 {%r3682, %r3683}, %rd3433; mov.b32 {%rs140, %rs721}, %r3682; and.b16 %rs722, %rs140, 255; setp.eq.s16 %p1959, %rs722, 2; mov.u32 %r1259, 0; @%p1959 bra $L__BB1_1656; and.b64 %rd9138, %rd3433, 4294967040; cvt.u64.u16 %rd9139, %rs140; and.b64 %rd9140, %rd9139, 255; or.b64 %rd9141, %rd9140, %rd8482; or.b64 %rd9142, %rd9141, %rd9138; mov.b64 {%r3685, %r3686}, %rd9142; mov.b32 {%rs723, %rs724}, %r3685; shr.u64 %rd9143, %rd8507, 32; cvt.u32.u64 %r3687, %rd9143; cvt.u32.u64 %r3688, %rd8507; mov.b32 %f3987, %r3688; sub.f32 %f3988, %f3987, %f859; mov.b32 %f3989, %r3687; sub.f32 %f3990, %f3989, %f860; mul.f32 %f3991, %f3990, %f3990; fma.rn.f32 %f3992, %f3988, %f3988, %f3991; add.f32 %f3993, %f3992, 0f00000000; sqrt.rn.f32 %f3994, %f3993; and.b16 %rs725, 
%rs723, 1; setp.eq.b16 %p1960, %rs725, 1; selp.f32 %f3995, 0fBF800000, 0f3F800000, %p1960; mul.f32 %f1207, %f3995, %f3994; setp.eq.s64 %p1961, %rd3471, 0; setp.eq.s64 %p1962, %rd2883, 0; or.pred %p1963, %p1961, %p1962; mov.u32 %r1259, 1; @%p1963 bra $L__BB1_1656; add.u64 %rd9144, %SP, 560; add.u64 %rd3434, %SPL, 560; add.u64 %rd9148, %SP, 32; add.u64 %rd3438, %SPL, 32; add.s64 %rd3440, %rd3434, 8; add.u64 %rd9151, %SP, 0; add.u64 %rd3442, %SPL, 0; add.s64 %rd3443, %rd3442, 8; add.s64 %rd3445, %rd3442, 8; add.s64 %rd3447, %rd3442, 8; add.s64 %rd3449, %rd3442, 8; add.s64 %rd3451, %rd3442, 8; add.s64 %rd3453, %rd3442, 8; add.u64 %rd9157, %SP, 552; add.u64 %rd3454, %SPL, 552; add.s64 %rd3455, %rd3454, 8; add.s64 %rd3457, %rd3438, 36; add.s64 %rd3459, %rd3438, 4; add.s64 %rd3460, %rd9148, 36; add.s64 %rd3461, %rd3438, 44; add.s64 %rd3462, %rd9148, 44; add.s64 %rd3463, %rd3438, 52; add.s64 %rd3464, %rd3434, 8; add.s64 %rd3465, %rd3434, 8; or.b64 %rd3466, %rd9144, 8; add.s64 %rd3467, %rd3434, 16; mov.u64 %rd3470, %rd3469; $L__BB1_1425: add.s64 %rd2883, %rd2883, -1; ld.global.u32 %r3689, [%rd3469+272]; setp.eq.s32 %p1964, %r3689, 3; @%p1964 bra $L__BB1_1655; ld.global.u16 %rs726, [%rd3470]; setp.eq.s16 %p1965, %rs726, 1; @%p1965 bra $L__BB1_1597; setp.eq.s16 %p1966, %rs726, 2; @%p1966 bra $L__BB1_1486; setp.ne.s16 %p1967, %rs726, 3; @%p1967 bra $L__BB1_1635; ld.global.u8 %rs141, [%rd3470+24]; ld.global.f32 %f1029, [%rd3470+256]; sub.f32 %f3996, %f859, %f1029; ld.global.f32 %f1030, [%rd3470+260]; sub.f32 %f3997, %f860, %f1030; ld.global.f32 %f1031, [%rd3470+252]; ld.global.f32 %f1032, [%rd3470+248]; mul.f32 %f3998, %f3997, %f1031; fma.rn.f32 %f1033, %f3996, %f1032, %f3998; mul.f32 %f3999, %f3996, %f1031; mul.f32 %f4000, %f3997, %f1032; sub.f32 %f1034, %f4000, %f3999; mov.u32 %r1106, 2; st.local.u32 [%rd3438+20], %r1106; ld.global.u64 %rd3475, [%rd3470+16]; setp.eq.s64 %p1969, %rd3475, 0; mov.pred %p2945, -1; @%p1969 bra $L__BB1_1483; mov.b32 %r3705, %f1034; ld.global.u64 
%rd3476, [%rd3470+8]; mov.b32 %r3706, %f1033; and.b32 %r3707, %r3706, 2147483647; mov.b32 %f1035, %r3707; and.b32 %r3708, %r3705, 2147483647; mov.b32 %f1036, %r3708; mov.u64 %rd12506, 1; bra.uni $L__BB1_1431; $L__BB1_1439: sub.f32 %f4012, %f5436, %f1033; abs.f32 %f1051, %f4012; setp.le.f32 %p1979, %f1051, 0f34000000; @%p1979 bra $L__BB1_1441; abs.f32 %f4013, %f5436; abs.f32 %f4014, %f1033; setp.gt.f32 %p1981, %f4014, %f4013; selp.f32 %f4015, %f4014, %f4013, %p1981; mul.f32 %f4016, %f4015, 0f34000000; setp.gtu.f32 %p1982, %f1051, %f4016; @%p1982 bra $L__BB1_1445; bra.uni $L__BB1_1441; $L__BB1_1431: shl.b64 %rd9162, %rd12506, 3; add.s64 %rd9163, %rd3476, %rd9162; setp.eq.s64 %p1970, %rd12506, %rd3475; selp.b64 %rd9164, 0, %rd12506, %p1970; shl.b64 %rd9165, %rd9164, 3; add.s64 %rd9166, %rd3476, %rd9165; ld.u32 %rd9167, [%rd9166]; ld.u32 %rd9168, [%rd9166+4]; bfi.b64 %rd3483, %rd9168, %rd9167, 32, 32; ld.u32 %rd9169, [%rd9163+-8]; ld.u32 %rd9170, [%rd9163+-4]; bfi.b64 %rd3484, %rd9170, %rd9169, 32, 32; cvt.u32.u64 %r5218, %rd3484; mov.b32 %f5436, %r5218; shr.u64 %rd9171, %rd3484, 32; cvt.u32.u64 %r3712, %rd9171; mov.b32 %f1039, %r3712; cvt.u32.u64 %r1089, %rd3483; shr.u64 %rd9172, %rd3483, 32; cvt.u32.u64 %r3713, %rd9172; mov.b32 %f1040, %r1089; sub.f32 %f1041, %f1040, %f5436; mov.b32 %f4002, %r3713; sub.f32 %f1042, %f4002, %f1039; sub.f32 %f4003, %f1033, %f5436; sub.f32 %f4004, %f1034, %f1039; mul.f32 %f4005, %f1042, %f4004; fma.rn.f32 %f1043, %f1041, %f4003, %f4005; mul.f32 %f4006, %f1042, %f1042; fma.rn.f32 %f4007, %f1041, %f1041, %f4006; add.f32 %f1044, %f4007, 0f00000000; setp.gtu.f32 %p1971, %f1043, 0f00000000; mov.b64 {%r3714, %r5219}, %rd3484; mov.b64 {%r3715, %r1091}, %rd3483; @%p1971 bra $L__BB1_1433; bra.uni $L__BB1_1432; $L__BB1_1433: setp.ltu.f32 %p1972, %f1043, %f1044; @%p1972 bra $L__BB1_1435; bra.uni $L__BB1_1434; $L__BB1_1435: setp.eq.f32 %p1973, %f1044, 0f00000000; @%p1973 bra $L__BB1_1482; div.rn.f32 %f4008, %f1043, %f1044; mov.f32 %f4009, 
0f3F800000; sub.f32 %f4010, %f4009, %f4008; mov.b32 %r5221, %f4010; mov.b32 %r5222, %f4008; fma.rn.f32 %f5436, %f1041, %f4008, %f5436; mov.b32 %r5218, %f5436; fma.rn.f32 %f5437, %f1042, %f4008, %f1039; mov.b32 %r5219, %f5437; mov.u32 %r5220, 1; bra.uni $L__BB1_1437; $L__BB1_1432: mov.b32 %f5437, %r5219; mov.u32 %r5220, 0; mov.u32 %r5221, %r5220; bra.uni $L__BB1_1437; $L__BB1_1434: mov.b32 %f5437, %r1091; mov.u32 %r5221, 1; mov.u32 %r5220, 0; mov.f32 %f5436, %f1040; mov.u32 %r5218, %r1089; mov.u32 %r5219, %r1091; $L__BB1_1437: setp.eq.f32 %p1974, %f1033, %f5436; @%p1974 bra $L__BB1_1441; bra.uni $L__BB1_1438; $L__BB1_1441: setp.eq.f32 %p1984, %f5437, %f1034; mov.pred %p1983, -1; mov.pred %p2943, %p1983; @%p1984 bra $L__BB1_1445; setp.eq.f32 %p1986, %f1036, 0f7F800000; and.b32 %r3724, %r5219, 2147483647; mov.b32 %f4017, %r3724; setp.eq.f32 %p1987, %f4017, 0f7F800000; or.pred %p1988, %p1986, %p1987; mov.pred %p2943, 0; @%p1988 bra $L__BB1_1445; sub.f32 %f4018, %f5437, %f1034; abs.f32 %f1052, %f4018; setp.le.f32 %p1990, %f1052, 0f34000000; mov.pred %p2943, %p1983; @%p1990 bra $L__BB1_1445; abs.f32 %f4019, %f5437; abs.f32 %f4020, %f1034; setp.gt.f32 %p1991, %f4020, %f4019; selp.f32 %f4021, %f4020, %f4019, %p1991; mul.f32 %f4022, %f4021, 0f34000000; setp.le.f32 %p2943, %f1052, %f4022; bra.uni $L__BB1_1445; $L__BB1_1438: setp.eq.f32 %p1976, %f1035, 0f7F800000; and.b32 %r3723, %r5218, 2147483647; mov.b32 %f4011, %r3723; setp.eq.f32 %p1977, %f4011, 0f7F800000; or.pred %p1978, %p1976, %p1977; mov.pred %p2943, 0; @%p1978 bra $L__BB1_1445; bra.uni $L__BB1_1439; $L__BB1_1445: cvt.u64.u32 %rd9173, %r5219; cvt.u64.u32 %rd9174, %r5218; bfi.b64 %rd3485, %rd9173, %rd9174, 32, 32; mov.b64 {%r3725, %r3726}, %rd3485; selp.u64 %rd3486, 1, 0, %p2943; mov.b32 %f1054, %r3726; mov.b32 %f1053, %r3725; sub.f32 %f4023, %f1053, %f1033; sub.f32 %f4024, %f1054, %f1034; mul.f32 %f4025, %f4024, %f4024; fma.rn.f32 %f4026, %f4023, %f4023, %f4025; add.f32 %f4027, %f4026, 0f00000000; sqrt.rn.f32 
%f1056, %f4027; setp.geu.f32 %p1992, %f1056, %f5438; setp.ne.s32 %p1993, %r1106, 2; and.pred %p1994, %p1993, %p1992; @%p1994 bra $L__BB1_1447; add.s64 %rd12507, %rd12506, -1; st.local.u64 [%rd3438], %rd12507; st.local.v2.f32 [%rd3438+8], {%f1053, %f1054}; mov.b64 {%r3729, %r3730}, %rd3486; st.local.v2.u32 [%rd3438+16], {%r3729, %r5220}; st.local.v2.u32 [%rd3438+24], {%r5221, %r5222}; st.local.f32 [%rd3438+32], %f1056; st.local.u32 [%rd3438+36], %rd3484; st.local.u32 [%rd3438+44], %rd3483; st.local.u32 [%rd3438+40], %rd9171; st.local.u32 [%rd3438+48], %rd9172; mov.u32 %r5223, %r5221; mov.u64 %rd12508, %rd3484; mov.u64 %rd12509, %rd3483; mov.f32 %f5438, %f1056; mov.u32 %r1106, %r5220; $L__BB1_1447: add.s64 %rd3491, %rd12506, 1; setp.lt.u64 %p1995, %rd12506, %rd3475; mov.u64 %rd12506, %rd3491; @%p1995 bra $L__BB1_1431; cvt.u32.u64 %r3731, %rd12508; mov.b32 %f4028, %r3731; shr.u64 %rd9181, %rd12508, 32; cvt.u32.u64 %r3732, %rd9181; mov.b32 %f4029, %r3732; shr.u64 %rd9182, %rd12509, 32; cvt.u32.u64 %r3733, %rd9182; cvt.u32.u64 %r3734, %rd12509; mov.b32 %f4030, %r3734; sub.f32 %f1058, %f4030, %f4028; mov.b32 %f4031, %r3733; sub.f32 %f1059, %f4031, %f4029; mul.f32 %f4032, %f1059, %f1059; fma.rn.f32 %f4033, %f1058, %f1058, %f4032; add.f32 %f1060, %f4033, 0f00000000; setp.leu.f32 %p1996, %f1060, 0f28800000; mov.u64 %rd9180, 0; mov.u64 %rd12510, %rd9180; mov.u64 %rd12511, %rd9180; mov.u64 %rd12512, %rd9180; @%p1996 bra $L__BB1_1450; neg.f32 %f4034, %f1058; sqrt.rn.f32 %f4035, %f1060; div.rn.f32 %f4036, %f1059, %f4035; div.rn.f32 %f4037, %f4034, %f4035; mov.b32 %r3735, %f4037; mov.b32 %r3736, %f4036; mov.u64 %rd12512, 1; mov.b64 %rd9185, {%r3736, %r3735}; shr.u64 %rd12511, %rd9185, 32; shl.b64 %rd12510, %rd9185, 32; $L__BB1_1450: or.b64 %rd3498, %rd12512, %rd12510; or.b64 %rd3499, %rd9180, %rd12511; and.b64 %rd9186, %rd9180, 4294967295; xor.b64 %rd9187, %rd12512, 1; or.b64 %rd9188, %rd9187, %rd9186; setp.ne.s64 %p1997, %rd9188, 0; @%p1997 bra $L__BB1_1481; mov.b64 {%r3737, 
%r3738}, %rd3499; mov.b64 {%r3739, %r3740}, %rd3498; mov.b32 %f1061, %r3740; mov.b32 %f1062, %r3737; setp.eq.s32 %p1998, %r1106, 1; @%p1998 bra $L__BB1_1479; bra.uni $L__BB1_1452; $L__BB1_1479: ld.local.u64 %rd9265, [%rd3438+8]; cvt.u32.u64 %r3760, %rd9265; mov.b32 %f4065, %r3760; shr.u64 %rd9266, %rd9265, 32; cvt.u32.u64 %r3761, %rd9266; mov.b32 %f4066, %r3761; sub.f32 %f4067, %f859, %f4065; sub.f32 %f4068, %f860, %f4066; mul.f32 %f4069, %f1062, %f4068; fma.rn.f32 %f4070, %f1061, %f4067, %f4069; setp.le.f32 %p2944, %f4070, 0f00000000; bra.uni $L__BB1_1480; $L__BB1_1486: ld.global.f32 %f1070, [%rd3470+256]; mov.u64 %rd9286, 0; sub.f32 %f4080, %f859, %f1070; ld.global.f32 %f1071, [%rd3470+260]; sub.f32 %f4081, %f860, %f1071; ld.global.f32 %f1072, [%rd3470+252]; ld.global.f32 %f1073, [%rd3470+248]; mul.f32 %f4082, %f4081, %f1072; fma.rn.f32 %f1074, %f4080, %f1073, %f4082; mul.f32 %f4083, %f4080, %f1072; mul.f32 %f4084, %f4081, %f1073; sub.f32 %f1075, %f4084, %f4083; mov.b32 %r3769, %f1074; mov.b32 %r3770, %f1075; cvt.u64.u32 %rd9287, %r3770; cvt.u64.u32 %rd9288, %r3769; bfi.b64 %rd9289, %rd9287, %rd9288, 32, 32; st.local.u64 [%rd3454], %rd9289; ld.global.u64 %rd3601, [%rd3470+32]; setp.eq.s64 %p2018, %rd3601, 0; mov.u64 %rd9284, 2; mov.u64 %rd12666, %rd9286; mov.u64 %rd12667, %rd9284; mov.u64 %rd12668, %rd9286; @%p2018 bra $L__BB1_1592; mov.u32 %r3777, 0; st.local.u32 [%rd3438], %r3777; mov.u32 %r3778, -16777217; st.local.u32 [%rd3438+4], %r3778; mov.u32 %r1127, 1; st.local.u32 [%rd3438+512], %r1127; ld.global.u64 %rd3603, [%rd3470+24]; ld.global.u64 %rd3604, [%rd3470+80]; ld.global.u64 %rd3605, [%rd3470+72]; mov.u32 %r1125, 2139095039; mov.u32 %r1124, 4; bra.uni $L__BB1_1488; $L__BB1_1597: ld.global.f32 %f1141, [%rd3470+256]; sub.f32 %f4276, %f859, %f1141; ld.global.f32 %f1142, [%rd3470+260]; sub.f32 %f4277, %f860, %f1142; ld.global.f32 %f1143, [%rd3470+252]; ld.global.f32 %f1144, [%rd3470+248]; mul.f32 %f4278, %f4277, %f1143; fma.rn.f32 %f1145, %f4276, %f1144, 
%f4278; mul.f32 %f4279, %f4276, %f1143; mul.f32 %f4280, %f4277, %f1144; sub.f32 %f1146, %f4280, %f4279; mov.b32 %r1228, %f1145; mov.b32 %r1229, %f1146; ld.global.v2.f32 {%f4281, %f4282}, [%rd3470+56]; ld.global.v2.f32 {%f4283, %f4284}, [%rd3470+48]; sub.f32 %f4285, %f1145, %f6; sub.f32 %f4286, %f1146, %f6; mov.b32 %r3949, %f4285; mov.b32 %r3950, %f4286; cvt.u64.u32 %rd9658, %r3950; cvt.u64.u32 %rd9659, %r3949; add.f32 %f4287, %f6, %f1145; add.f32 %f4288, %f6, %f1146; mov.b32 %r3951, %f4287; mov.b32 %r3952, %f4288; cvt.u64.u32 %rd9660, %r3952; cvt.u64.u32 %rd9661, %r3951; bfi.b64 %rd9662, %rd9658, %rd9659, 32, 32; mov.b64 {%r3953, %r3954}, %rd9662; bfi.b64 %rd9663, %rd9660, %rd9661, 32, 32; mov.b64 {%r3955, %r3956}, %rd9663; cvta.to.local.u64 %rd3964, %rd9148; mov.u16 %rs792, 2; st.local.u8 [%rd3964+8], %rs792; mov.b32 %f1154, %r3956; mov.b32 %f1152, %r3954; mov.b32 %f1153, %r3955; mov.b32 %f1151, %r3953; ld.global.v2.f32 {%f4289, %f4290}, [%rd3470+40]; div.rn.f32 %f1157, %f1151, %f4289; div.rn.f32 %f1158, %f1153, %f4289; ld.global.u64 %rd3965, [%rd3470+16]; cvt.rn.f32.u64 %f4291, %rd3965; add.f32 %f4292, %f4291, 0fBF800000; rcp.rn.f32 %f1159, %f4292; setp.lt.f32 %p2185, %f1158, 0fBF000000; setp.gt.f32 %p2186, %f1157, 0f3F000000; or.pred %p2187, %p2186, %p2185; @%p2187 bra $L__BB1_1629; add.f32 %f4293, %f1157, 0f3F000000; div.rn.f32 %f4294, %f4293, %f1159; cvt.rmi.f32.f32 %f4295, %f4294; add.s64 %rd9665, %rd3965, -2; cvt.rn.f32.u64 %f4296, %rd9665; setp.gt.f32 %p2188, %f4295, 0f00000000; setp.lt.f32 %p2189, %f4295, %f4296; selp.f32 %f4297, %f4295, %f4296, %p2189; selp.f32 %f4298, %f4297, 0f00000000, %p2188; setp.gt.f32 %p2190, %f4298, 0f5F7FFFFF; max.f32 %f4299, %f4298, 0f00000000; cvt.rzi.u64.f32 %rd9666, %f4299; selp.b64 %rd3971, -1, %rd9666, %p2190; add.f32 %f4300, %f1158, 0f3F000000; div.rn.f32 %f4301, %f4300, %f1159; cvt.rpi.f32.f32 %f4302, %f4301; add.s64 %rd9667, %rd3965, -1; cvt.rn.f32.u64 %f4303, %rd9667; setp.gt.f32 %p2191, %f4302, 0f00000000; setp.lt.f32 
%p2192, %f4302, %f4303; selp.f32 %f4304, %f4302, %f4303, %p2192; selp.f32 %f4305, %f4304, 0f00000000, %p2191; setp.gt.f32 %p2193, %f4305, 0f5F7FFFFF; max.f32 %f4306, %f4305, 0f00000000; cvt.rzi.u64.f32 %rd9668, %f4306; selp.b64 %rd3967, -1, %rd9668, %p2193; setp.ge.u64 %p2194, %rd3971, %rd3967; @%p2194 bra $L__BB1_1629; div.rn.f32 %f1160, %f1152, %f4290; div.rn.f32 %f1161, %f1154, %f4290; ld.global.u64 %rd3968, [%rd3470+32]; ld.global.u64 %rd3969, [%rd3470+24]; ld.global.u64 %rd3970, [%rd3470+8]; and.b32 %r3957, %r1228, 2147483647; mov.b32 %f1162, %r3957; and.b32 %r3958, %r1229, 2147483647; mov.b32 %f1163, %r3958; ld.local.v4.u32 {%r5283, %r5284, %r5285, %r3962}, [%rd3964]; mov.f32 %f5450, 0f7F7FFFFF; bra.uni $L__BB1_1600; $L__BB1_1635: ld.global.f32 %f1188, [%rd3470+256]; sub.f32 %f4345, %f859, %f1188; ld.global.f32 %f1189, [%rd3470+260]; sub.f32 %f4346, %f860, %f1189; ld.global.f32 %f1190, [%rd3470+252]; ld.global.f32 %f1191, [%rd3470+248]; mul.f32 %f4347, %f4346, %f1190; fma.rn.f32 %f1192, %f4345, %f1191, %f4347; mul.f32 %f4348, %f4345, %f1190; mul.f32 %f4349, %f4346, %f1191; sub.f32 %f1193, %f4349, %f4348; ld.global.u32 %rd9695, [%rd3470+8]; ld.global.u32 %rd9696, [%rd3470+12]; bfi.b64 %rd9697, %rd9696, %rd9695, 32, 32; cvt.u32.u64 %r3997, %rd9697; mov.b32 %f4350, %r3997; shr.u64 %rd9698, %rd9697, 32; cvt.u32.u64 %r3998, %rd9698; mov.b32 %f4351, %r3998; neg.f32 %f4352, %f4350; neg.f32 %f4353, %f4351; sub.f32 %f1194, %f4352, %f1192; sub.f32 %f1195, %f4353, %f1193; sub.f32 %f1196, %f1192, %f4350; sub.f32 %f1197, %f1193, %f4351; setp.ge.f32 %p2243, %f1194, 0f00000000; selp.f32 %f4354, %f1194, 0f00000000, %p2243; setp.ge.f32 %p2244, %f1195, 0f00000000; selp.f32 %f4355, %f1195, 0f00000000, %p2244; setp.ge.f32 %p2245, %f1196, 0f00000000; selp.f32 %f4356, %f1196, 0f00000000, %p2245; setp.ge.f32 %p2246, %f1197, 0f00000000; selp.f32 %f4357, %f1197, 0f00000000, %p2246; sub.f32 %f1198, %f4354, %f4356; mov.b32 %r3999, %f1198; sub.f32 %f1199, %f4355, %f4357; mov.b32 %r4000, 
%f1199; cvt.u64.u32 %rd9699, %r4000; cvt.u64.u32 %rd9700, %r3999; bfi.b64 %rd9701, %rd9699, %rd9700, 32, 32; st.local.u64 [%rd3434], %rd9701; mov.u64 %rd12682, 2; mov.u64 %rd12675, %rd3440; mov.u64 %rd12676, %rd3434; mov.u64 %rd12677, %rd3434; mov.u64 %rd12678, %rd9144; mov.u64 %rd12679, %rd3434; mov.u64 %rd12680, %rd3434; mov.u64 %rd12681, %rd9144; $L__BB1_1636: setp.eq.s64 %p2247, %rd12682, 0; @%p2247 bra $L__BB1_1639; add.s64 %rd12682, %rd12682, -1; add.s64 %rd9702, %rd12679, 8; setp.eq.s64 %p2248, %rd12679, %rd12675; selp.b64 %rd12675, %rd9702, %rd12675, %p2248; add.s64 %rd9703, %rd12676, 8; selp.b64 %rd12676, %rd9703, %rd12676, %p2248; add.s64 %rd9704, %rd12677, 8; selp.b64 %rd12677, %rd9704, %rd12677, %p2248; add.s64 %rd9705, %rd12678, 8; selp.b64 %rd12678, %rd9705, %rd12678, %p2248; selp.b64 %rd9706, %rd9703, %rd12679, %p2248; selp.b64 %rd9707, %rd9704, %rd12680, %p2248; selp.b64 %rd9708, %rd9705, %rd12681, %p2248; setp.eq.s64 %p2249, %rd12682, 0; add.s64 %rd9709, %rd9706, 4; add.s64 %rd9710, %rd9707, 4; add.s64 %rd9711, %rd9708, 4; selp.b64 %rd12679, %rd9706, %rd9709, %p2249; selp.b64 %rd12680, %rd9707, %rd9710, %p2249; selp.b64 %rd12681, %rd9708, %rd9711, %p2249; ld.local.f32 %f4358, [%rd9707]; setp.eq.f32 %p2250, %f4358, 0f00000000; @%p2250 bra $L__BB1_1636; add.f32 %f4359, %f1192, %f1198; mov.b32 %r4001, %f4359; add.f32 %f4360, %f1193, %f1199; mov.b32 %r4002, %f4360; cvt.u64.u32 %rd9714, %r4002; cvt.u64.u32 %rd9715, %r4001; bfi.b64 %rd12686, %rd9714, %rd9715, 32, 32; mov.u64 %rd12685, 0; bra.uni $L__BB1_1652; $L__BB1_1639: setp.lt.f32 %p2251, %f1194, %f1196; mov.f32 %f5451, 0fFF7FFFFF; @%p2251 bra $L__BB1_1642; bra.uni $L__BB1_1640; $L__BB1_1642: setp.leu.f32 %p2256, %f1196, 0fFF7FFFFF; mov.pred %p2949, 0; @%p2256 bra $L__BB1_1644; mov.f32 %f5451, %f1196; bra.uni $L__BB1_1644; $L__BB1_1640: setp.leu.f32 %p2253, %f1194, 0fFF7FFFFF; mov.pred %p2949, 0; @%p2253 bra $L__BB1_1644; mov.pred %p2949, -1; mov.f32 %f5451, %f1194; $L__BB1_1644: setp.lt.f32 %p2258, 
%f1195, %f1197; @%p2258 bra $L__BB1_1647; bra.uni $L__BB1_1645; $L__BB1_1647: setp.gt.f32 %p2260, %f1197, %f5451; @%p2260 bra $L__BB1_1650; bra.uni $L__BB1_1648; $L__BB1_1650: mov.u64 %rd9718, 0; st.local.u64 [%rd3438], %rd9718; neg.f32 %f5453, %f1197; mov.u64 %rd12684, %rd3459; bra.uni $L__BB1_1651; $L__BB1_1645: setp.leu.f32 %p2259, %f1195, %f5451; @%p2259 bra $L__BB1_1648; mov.u64 %rd9716, 0; st.local.u64 [%rd3438], %rd9716; mov.u64 %rd12684, %rd3459; mov.f32 %f5451, %f1195; bra.uni $L__BB1_1649; $L__BB1_1648: mov.u64 %rd9717, 0; st.local.u64 [%rd3438], %rd9717; neg.f32 %f5453, %f5451; not.pred %p2261, %p2949; mov.u64 %rd12684, %rd3438; @%p2261 bra $L__BB1_1651; $L__BB1_1649: mov.f32 %f5453, %f5451; $L__BB1_1651: st.local.f32 [%rd12684], %f5453; ld.local.u64 %rd9721, [%rd3438]; cvt.u32.u64 %r4003, %rd9721; mov.b32 %f4363, %r4003; shr.u64 %rd9722, %rd9721, 32; cvt.u32.u64 %r4004, %rd9722; mov.b32 %f4364, %r4004; add.f32 %f4365, %f1192, %f4363; add.f32 %f4366, %f1193, %f4364; mov.b32 %r4005, %f4365; mov.b32 %r4006, %f4366; cvt.u64.u32 %rd9723, %r4006; cvt.u64.u32 %rd9724, %r4005; bfi.b64 %rd12686, %rd9723, %rd9724, 32, 32; mov.u64 %rd12685, 1; $L__BB1_1652: mov.u64 %rd11242, 0; cvt.u32.u64 %r4007, %rd12686; mov.b32 %f4367, %r4007; shr.u64 %rd9725, %rd12686, 32; cvt.u32.u64 %r4008, %rd9725; mov.b32 %f4368, %r4008; mul.f32 %f4369, %f1191, %f4367; mul.f32 %f4370, %f1190, %f4368; sub.f32 %f4371, %f4369, %f4370; mul.f32 %f4372, %f1191, %f4368; fma.rn.f32 %f4373, %f1190, %f4367, %f4372; add.f32 %f4374, %f1188, %f4371; mov.b32 %r4009, %f4374; add.f32 %f4375, %f1189, %f4373; mov.b32 %r4010, %f4375; cvt.u64.u32 %rd9726, %r4010; cvt.u64.u32 %rd9727, %r4009; bfi.b64 %rd9728, %rd9726, %rd9727, 32, 32; or.b64 %rd9729, %rd11242, %rd9728; mov.b64 {%r5286, %r5287}, %rd9729; mov.b64 {%r5288, %r4011}, %rd12685; bra.uni $L__BB1_1653; $L__BB1_1617: sub.f32 %f4319, %f5448, %f1145; abs.f32 %f1181, %f4319; setp.le.f32 %p2213, %f1181, 0f34000000; @%p2213 bra $L__BB1_1619; abs.f32 %f4320, 
%f5448; abs.f32 %f4321, %f1145; setp.gt.f32 %p2215, %f4321, %f4320; selp.f32 %f4322, %f4321, %f4320, %p2215; mul.f32 %f4323, %f4322, 0f34000000; setp.gtu.f32 %p2216, %f1181, %f4323; @%p2216 bra $L__BB1_1623; bra.uni $L__BB1_1619; $L__BB1_1600: setp.gt.u64 %p2195, %rd3968, %rd3971; @%p2195 bra $L__BB1_1602; bra.uni $L__BB1_1601; $L__BB1_1602: add.s64 %rd9669, %rd3969, %rd3971; ld.u8 %rs793, [%rd9669]; setp.eq.s16 %p2196, %rs793, 0; @%p2196 bra $L__BB1_1627; cvt.rn.f32.u64 %f4308, %rd3971; fma.rn.f32 %f1165, %f1159, %f4308, 0fBF000000; setp.gt.u64 %p2197, %rd3965, %rd3971; @%p2197 bra $L__BB1_1605; bra.uni $L__BB1_1604; $L__BB1_1605: shl.b64 %rd9670, %rd3971, 2; add.s64 %rd3972, %rd3970, %rd9670; ld.f32 %f1166, [%rd3972]; add.s64 %rd9671, %rd3971, 1; setp.gt.u64 %p2198, %rd3965, %rd9671; @%p2198 bra $L__BB1_1607; bra.uni $L__BB1_1606; $L__BB1_1607: ld.f32 %f1167, [%rd3972+4]; setp.gt.f32 %p2199, %f1167, %f1161; setp.gt.f32 %p2200, %f1166, %f1161; and.pred %p2201, %p2200, %p2199; @%p2201 bra $L__BB1_1627; setp.lt.f32 %p2202, %f1166, %f1160; setp.lt.f32 %p2203, %f1167, %f1160; and.pred %p2204, %p2202, %p2203; @%p2204 bra $L__BB1_1627; mul.f32 %f4309, %f4289, %f1165; mov.b32 %r3963, %f4309; mul.f32 %f1170, %f4290, %f1166; mov.b32 %r3964, %f1170; cvt.u64.u32 %rd9672, %r3964; cvt.u64.u32 %rd9673, %r3963; add.f32 %f4310, %f1159, %f1165; mul.f32 %f1168, %f4289, %f4310; mov.b32 %r1236, %f1168; mul.f32 %f4311, %f4290, %f1167; mov.b32 %r3965, %f4311; cvt.u64.u32 %rd9674, %r3965; cvt.u64.u32 %rd9675, %r1236; bfi.b64 %rd9676, %rd9674, %rd9675, 32, 32; bfi.b64 %rd9677, %rd9672, %rd9673, 32, 32; cvt.u32.u64 %r5281, %rd9677; mov.b32 %f5448, %r5281; sub.f32 %f1171, %f1168, %f5448; sub.f32 %f1172, %f4311, %f1170; sub.f32 %f4312, %f1145, %f5448; sub.f32 %f4313, %f1146, %f1170; mul.f32 %f4314, %f1172, %f4313; fma.rn.f32 %f1173, %f1171, %f4312, %f4314; mul.f32 %f4315, %f1172, %f1172; fma.rn.f32 %f4316, %f1171, %f1171, %f4315; add.f32 %f1174, %f4316, 0f00000000; setp.gtu.f32 %p2205, 
%f1173, 0f00000000; mov.b64 {%r3966, %r5282}, %rd9677; mov.b64 {%r3967, %r1239}, %rd9676; @%p2205 bra $L__BB1_1611; bra.uni $L__BB1_1610; $L__BB1_1611: setp.ltu.f32 %p2206, %f1173, %f1174; @%p2206 bra $L__BB1_1613; bra.uni $L__BB1_1612; $L__BB1_1613: setp.eq.f32 %p2207, %f1174, 0f00000000; @%p2207 bra $L__BB1_1626; div.rn.f32 %f4317, %f1173, %f1174; fma.rn.f32 %f5448, %f1171, %f4317, %f5448; mov.b32 %r5281, %f5448; fma.rn.f32 %f5449, %f1172, %f4317, %f1170; mov.b32 %r5282, %f5449; bra.uni $L__BB1_1615; $L__BB1_1610: mov.b32 %f5449, %r5282; bra.uni $L__BB1_1615; $L__BB1_1612: mov.b32 %f5449, %r1239; mov.f32 %f5448, %f1168; mov.u32 %r5281, %r1236; mov.u32 %r5282, %r1239; $L__BB1_1615: setp.eq.f32 %p2208, %f1145, %f5448; @%p2208 bra $L__BB1_1619; bra.uni $L__BB1_1616; $L__BB1_1619: setp.eq.f32 %p2218, %f5449, %f1146; mov.pred %p2217, -1; mov.pred %p2947, %p2217; @%p2218 bra $L__BB1_1623; setp.eq.f32 %p2220, %f1163, 0f7F800000; and.b32 %r3969, %r5282, 2147483647; mov.b32 %f4324, %r3969; setp.eq.f32 %p2221, %f4324, 0f7F800000; or.pred %p2222, %p2220, %p2221; mov.pred %p2947, 0; @%p2222 bra $L__BB1_1623; sub.f32 %f4325, %f5449, %f1146; abs.f32 %f1182, %f4325; setp.le.f32 %p2224, %f1182, 0f34000000; mov.pred %p2947, %p2217; @%p2224 bra $L__BB1_1623; abs.f32 %f4326, %f5449; abs.f32 %f4327, %f1146; setp.gt.f32 %p2225, %f4327, %f4326; selp.f32 %f4328, %f4327, %f4326, %p2225; mul.f32 %f4329, %f4328, 0f34000000; setp.le.f32 %p2947, %f1182, %f4329; bra.uni $L__BB1_1623; $L__BB1_1616: setp.eq.f32 %p2210, %f1162, 0f7F800000; and.b32 %r3968, %r5281, 2147483647; mov.b32 %f4318, %r3968; setp.eq.f32 %p2211, %f4318, 0f7F800000; or.pred %p2212, %p2210, %p2211; mov.pred %p2947, 0; @%p2212 bra $L__BB1_1623; bra.uni $L__BB1_1617; $L__BB1_1623: cvt.u64.u32 %rd9678, %r5282; cvt.u64.u32 %rd9679, %r5281; bfi.b64 %rd3973, %rd9678, %rd9679, 32, 32; mov.b64 {%r3970, %r3971}, %rd3973; selp.u64 %rd3974, 1, 0, %p2947; mov.b32 %f4330, %r3970; sub.f32 %f4331, %f4330, %f1145; mov.b32 %f4332, %r3971; 
sub.f32 %f4333, %f4332, %f1146; mul.f32 %f4334, %f4333, %f4333; fma.rn.f32 %f4335, %f4331, %f4331, %f4334; add.f32 %f1183, %f4335, 0f00000000; setp.geu.f32 %p2226, %f1183, %f5450; @%p2226 bra $L__BB1_1627; sqrt.rn.f32 %f4336, %f1183; setp.gtu.f32 %p2227, %f4336, %f6; mov.f32 %f5450, %f1183; @%p2227 bra $L__BB1_1627; mov.b64 {%r5285, %r3972}, %rd3974; mov.u32 %r5283, %r3970; mov.u32 %r5284, %r3971; mov.f32 %f5450, %f1183; $L__BB1_1627: add.s64 %rd3971, %rd3971, 1; setp.lt.u64 %p2228, %rd3971, %rd3967; @%p2228 bra $L__BB1_1600; st.local.u32 [%rd3964+8], %r5285; mov.b64 %rd9680, {%r5283, %r5284}; st.local.u64 [%rd3964], %rd9680; $L__BB1_1629: cvt.u64.u32 %rd9681, %r1228; cvt.u64.u32 %rd9682, %r1229; bfi.b64 %rd3976, %rd9682, %rd9681, 32, 32; ld.local.v4.u32 {%r3976, %r3977, %r3978, %r3979}, [%rd3964]; mov.b64 %rd3978, {%r3978, %r3979}; mov.b64 %rd3977, {%r3976, %r3977}; mov.b32 {%rs794, %rs795}, %r3978; and.b16 %rs796, %rs794, 255; setp.eq.s16 %p2229, %rs796, 2; cvt.u64.u16 %rd9683, %rs794; and.b64 %rd9684, %rd9683, 255; selp.b64 %rd9685, 2, %rd9684, %p2229; and.b64 %rd9686, %rd3978, 4294967040; or.b64 %rd9687, %rd9686, %rd9685; mov.b64 {%r3984, %r3985}, %rd9687; mov.b32 {%rs1036, %rs797}, %r3984; and.b16 %rs798, %rs1036, 255; setp.eq.s16 %p2230, %rs798, 2; mov.u32 %r5288, 2; mov.u32 %r5286, 0; mov.u32 %r5287, %r5286; @%p2230 bra $L__BB1_1653; ld.global.u8 %rs799, [%rd3470+64]; setp.eq.s16 %p2231, %rs799, 0; shr.u64 %rd9688, %rd3977, 32; cvt.u32.u64 %r3986, %rd9688; mov.b32 %f1185, %r3986; @%p2231 bra $L__BB1_1634; mov.b64 {%r3987, %r3988}, %rd3976; mov.b32 %f1187, %r3988; mov.b32 %f1186, %r3987; ld.global.u8 %rs159, [%rd3470+65]; setp.gt.f32 %p2233, %f1186, %f4281; setp.lt.f32 %p2234, %f1186, %f4283; or.pred %p2235, %p2234, %p2233; mov.pred %p2948, 0; @%p2235 bra $L__BB1_1633; setp.geu.f32 %p2236, %f1187, 0fFF7FFFFF; setp.leu.f32 %p2237, %f1187, 0f7F7FFFFF; and.pred %p2948, %p2237, %p2236; $L__BB1_1633: setp.ge.f32 %p2238, %f1146, %f1185; setp.le.f32 %p2239, %f1146, 
%f1185; setp.eq.s16 %p2240, %rs159, 0; selp.u32 %r3989, -1, 0, %p2238; selp.u32 %r3990, -1, 0, %p2239; selp.b32 %r3991, %r3990, %r3989, %p2240; and.b32 %r3992, %r3991, 1; setp.eq.b32 %p2241, %r3992, 1; and.pred %p2242, %p2241, %p2948; selp.u16 %rs1036, 1, 0, %p2242; $L__BB1_1634: cvt.u32.u64 %r3993, %rd3977; mov.b32 %f4337, %r3993; mul.f32 %f4338, %f1144, %f4337; mul.f32 %f4339, %f1143, %f1185; sub.f32 %f4340, %f4338, %f4339; mul.f32 %f4341, %f1144, %f1185; fma.rn.f32 %f4342, %f1143, %f4337, %f4341; add.f32 %f4343, %f1141, %f4340; mov.b32 %r3994, %f4343; add.f32 %f4344, %f1142, %f4342; mov.b32 %r3995, %f4344; cvt.u64.u32 %rd9689, %r3995; cvt.u64.u32 %rd9690, %r3994; cvt.u64.u16 %rd9691, %rs1036; bfi.b64 %rd9692, %rd9689, %rd9690, 32, 32; and.b64 %rd9693, %rd9691, 255; mov.b64 {%r5286, %r5287}, %rd9692; mov.b64 {%r5288, %r3996}, %rd9693; bra.uni $L__BB1_1653; $L__BB1_1452: setp.eq.s32 %p1999, %r5223, 0; @%p1999 bra $L__BB1_1465; setp.ne.s32 %p2000, %r5223, 1; @%p2000 bra $L__BB1_1478; add.s64 %rd3500, %rd12507, 1; or.b64 %rd9189, %rd3500, %rd3475; and.b64 %rd9190, %rd9189, -4294967296; setp.eq.s64 %p2001, %rd9190, 0; @%p2001 bra $L__BB1_1456; rem.u64 %rd12513, %rd3500, %rd3475; bra.uni $L__BB1_1457; $L__BB1_1465: setp.eq.s64 %p2008, %rd12507, 0; selp.b64 %rd3547, %rd3475, %rd12507, %p2008; add.s64 %rd9229, %rd3547, -1; setp.gt.u64 %p2009, %rd3475, %rd9229; @%p2009 bra $L__BB1_1467; bra.uni $L__BB1_1466; $L__BB1_1467: shl.b64 %rd9230, %rd3547, 3; add.s64 %rd9231, %rd3476, %rd9230; ld.u32 %rd9232, [%rd9231+-8]; ld.u32 %rd9233, [%rd9231+-4]; bfi.b64 %rd3548, %rd9233, %rd9232, 32, 32; or.b64 %rd9234, %rd3547, %rd3475; and.b64 %rd9235, %rd9234, -4294967296; setp.eq.s64 %p2010, %rd9235, 0; @%p2010 bra $L__BB1_1469; rem.u64 %rd12530, %rd3547, %rd3475; bra.uni $L__BB1_1470; $L__BB1_1583: ld.u32 %r3926, [%rd3613+76]; cvt.u64.u32 %rd9599, %r3926; setp.le.u64 %p2175, %rd3604, %rd9599; mul.wide.u32 %rd9600, %r3926, 12; add.s64 %rd9601, %rd3605, %rd9600; setp.eq.s64 %p2176, 
%rd9601, 0; or.pred %p2177, %p2175, %p2176; selp.b32 %r1122, %r1122, %r5242, %p2177; selp.b32 %r1121, %r1121, %r5241, %p2177; selp.b32 %r1120, %r1120, %r5240, %p2177; selp.b32 %r1124, %r1124, %r5255, %p2177; selp.b32 %r1125, %r1125, %r1174, %p2177; $L__BB1_1488: mov.b32 %f1076, %r1125; $L__BB1_1489: mov.u32 %r1126, %r1127; setp.eq.s32 %p2019, %r1126, 0; @%p2019 bra $L__BB1_1590; cvt.u64.u32 %rd9291, %r1126; add.s64 %rd9292, %rd9291, -1; cvt.u32.u64 %r1127, %rd9292; st.local.u32 [%rd3438+512], %r1127; mul.wide.u32 %rd9293, %r1126, 8; add.s64 %rd9294, %rd3438, %rd9293; ld.local.u32 %rd3611, [%rd9294+-4]; ld.local.u32 %rd9295, [%rd9294+-8]; shl.b64 %rd9296, %rd9295, 32; or.b64 %rd3610, %rd9296, 1; mov.b64 {%r3782, %r3783}, %rd3611; mov.b32 %f4085, %r3782; neg.f32 %f4086, %f4085; setp.le.f32 %p2020, %f1076, %f4086; @%p2020 bra $L__BB1_1489; mov.b64 {%r3784, %r3785}, %rd3610; cvt.u64.u32 %rd3612, %r3785; setp.gt.u64 %p2021, %rd3601, %rd3612; @%p2021 bra $L__BB1_1493; bra.uni $L__BB1_1492; $L__BB1_1493: mul.lo.s64 %rd9297, %rd3612, 96; add.s64 %rd3613, %rd3603, %rd9297; ld.u8 %rs734, [%rd3613+88]; and.b16 %rs735, %rs734, 1; setp.eq.b16 %p2023, %rs735, 1; mov.pred %p2946, 0; xor.pred %p2024, %p2023, %p2946; not.pred %p2025, %p2024; @%p2025 bra $L__BB1_1495; ld.v4.u32 {%r3786, %r3787, %r3788, %r3789}, [%rd3613+64]; cvt.u64.u32 %rd9298, %r3786; setp.gt.u64 %p2027, %rd3604, %rd9298; mul.wide.u32 %rd9299, %r3786, 12; add.s64 %rd9300, %rd3605, %rd9299; selp.b64 %rd9301, %rd9300, 0, %p2027; setp.eq.s64 %p2028, %rd9301, 0; add.s64 %rd9302, %rd9301, 8; selp.b64 %rd12551, 0, %rd9302, %p2028; cvt.u64.u32 %rd9303, %r3787; setp.gt.u64 %p2029, %rd3604, %rd9303; mul.wide.u32 %rd9304, %r3787, 12; add.s64 %rd9305, %rd3605, %rd9304; selp.b64 %rd9306, %rd9305, 0, %p2029; setp.eq.s64 %p2030, %rd9306, 0; add.s64 %rd9307, %rd9306, 8; selp.b64 %rd12550, 0, %rd9307, %p2030; ld.u32 %r3793, [%rd3613+72]; cvt.u64.u32 %rd9308, %r3793; setp.gt.u64 %p2031, %rd3604, %rd9308; mul.wide.u32 %rd9309, 
%r3793, 12; add.s64 %rd9310, %rd3605, %rd9309; selp.b64 %rd9311, %rd9310, 0, %p2031; setp.eq.s64 %p2032, %rd9311, 0; add.s64 %rd9312, %rd9311, 8; selp.b64 %rd12549, 0, %rd9312, %p2032; cvt.u64.u32 %rd9313, %r3789; setp.gt.u64 %p2033, %rd3604, %rd9313; mul.wide.u32 %rd9314, %r3789, 12; add.s64 %rd9315, %rd3605, %rd9314; selp.b64 %rd9316, %rd9315, 0, %p2033; setp.eq.s64 %p2034, %rd9316, 0; add.s64 %rd9317, %rd9316, 8; selp.b64 %rd12548, 0, %rd9317, %p2034; mov.pred %p2946, -1; $L__BB1_1495: ld.v4.f32 {%f4087, %f4088, %f4089, %f4090}, [%rd3613]; sub.f32 %f4095, %f4087, %f1074; sub.f32 %f4096, %f4088, %f1074; sub.f32 %f4097, %f4089, %f1074; sub.f32 %f4098, %f4090, %f1074; ld.v4.f32 {%f4099, %f4100, %f4101, %f4102}, [%rd3613+16]; sub.f32 %f4107, %f4099, %f1075; sub.f32 %f4108, %f4100, %f1075; sub.f32 %f4109, %f4101, %f1075; sub.f32 %f4110, %f4102, %f1075; ld.v4.f32 {%f4111, %f4112, %f4113, %f4114}, [%rd3613+32]; sub.f32 %f4119, %f1074, %f4111; sub.f32 %f4120, %f1074, %f4112; sub.f32 %f4121, %f1074, %f4113; sub.f32 %f4122, %f1074, %f4114; ld.v4.f32 {%f4123, %f4124, %f4125, %f4126}, [%rd3613+48]; sub.f32 %f4131, %f1075, %f4123; sub.f32 %f4132, %f1075, %f4124; sub.f32 %f4133, %f1075, %f4125; sub.f32 %f4134, %f1075, %f4126; setp.ge.f32 %p2035, %f4095, %f4119; selp.f32 %f4135, %f4095, %f4119, %p2035; setp.ge.f32 %p2036, %f4096, %f4120; selp.f32 %f4136, %f4096, %f4120, %p2036; setp.ge.f32 %p2037, %f4097, %f4121; selp.f32 %f4137, %f4097, %f4121, %p2037; setp.ge.f32 %p2038, %f4098, %f4122; selp.f32 %f4138, %f4098, %f4122, %p2038; setp.ge.f32 %p2039, %f4107, %f4131; selp.f32 %f4139, %f4107, %f4131, %p2039; setp.ge.f32 %p2040, %f4108, %f4132; selp.f32 %f4140, %f4108, %f4132, %p2040; setp.ge.f32 %p2041, %f4109, %f4133; selp.f32 %f4141, %f4109, %f4133, %p2041; setp.ge.f32 %p2042, %f4110, %f4134; selp.f32 %f4142, %f4110, %f4134, %p2042; setp.ge.f32 %p2043, %f4135, 0f00000000; selp.f32 %f4143, %f4135, 0f00000000, %p2043; setp.ge.f32 %p2044, %f4136, 0f00000000; selp.f32 %f4144, 
%f4136, 0f00000000, %p2044; setp.ge.f32 %p2045, %f4137, 0f00000000; selp.f32 %f4145, %f4137, 0f00000000, %p2045; setp.ge.f32 %p2046, %f4138, 0f00000000; selp.f32 %f4146, %f4138, 0f00000000, %p2046; mov.b32 %r3794, %f4143; mov.b32 %r3795, %f4144; mov.b32 %r3796, %f4145; mov.b32 %r3797, %f4146; cvt.u64.u32 %rd9318, %r3797; cvt.u64.u32 %rd9319, %r3795; cvt.u64.u32 %rd9320, %r3794; cvt.u64.u32 %rd9321, %r3796; bfi.b64 %rd9322, %rd9318, %rd9321, 32, 32; bfi.b64 %rd9323, %rd9319, %rd9320, 32, 32; setp.ge.f32 %p2047, %f4139, 0f00000000; selp.f32 %f4147, %f4139, 0f00000000, %p2047; setp.ge.f32 %p2048, %f4140, 0f00000000; selp.f32 %f4148, %f4140, 0f00000000, %p2048; setp.ge.f32 %p2049, %f4141, 0f00000000; selp.f32 %f4149, %f4141, 0f00000000, %p2049; setp.ge.f32 %p2050, %f4142, 0f00000000; selp.f32 %f4150, %f4142, 0f00000000, %p2050; mov.b32 %r3798, %f4147; mov.b32 %r3799, %f4148; mov.b32 %r3800, %f4149; mov.b32 %r3801, %f4150; cvt.u64.u32 %rd9324, %r3801; cvt.u64.u32 %rd9325, %r3799; cvt.u64.u32 %rd9326, %r3798; cvt.u64.u32 %rd9327, %r3800; bfi.b64 %rd9328, %rd9324, %rd9327, 32, 32; bfi.b64 %rd9329, %rd9325, %rd9326, 32, 32; mov.b64 {%r3802, %r3803}, %rd9323; mov.b64 {%r3804, %r3805}, %rd9322; cvt.u64.u32 %rd9330, %r3805; cvt.u64.u32 %rd9331, %r3803; cvt.u64.u32 %rd9332, %r3804; bfi.b64 %rd9333, %rd9330, %rd9332, 32, 32; mov.b64 {%r3806, %r3807}, %rd9333; bfi.b64 %rd9334, %rd9331, %rd9320, 32, 32; mov.b64 {%r3808, %r3809}, %rd9334; mov.b32 %f4151, %r3808; mov.b32 %f4152, %r3809; mov.b32 %f4153, %r3806; mov.b32 %f4154, %r3807; mov.b32 %f4155, %r3802; mov.b32 %f4156, %r3803; mov.b32 %f4157, %r3804; mov.b32 %f4158, %r3805; mov.b64 {%r3810, %r3811}, %rd9329; mov.b64 {%r3812, %r3813}, %rd9328; cvt.u64.u32 %rd9335, %r3813; cvt.u64.u32 %rd9336, %r3811; cvt.u64.u32 %rd9337, %r3812; bfi.b64 %rd9338, %rd9335, %rd9337, 32, 32; mov.b64 {%r3814, %r3815}, %rd9338; bfi.b64 %rd9339, %rd9336, %rd9326, 32, 32; mov.b64 {%r3816, %r3817}, %rd9339; mov.b32 %f4159, %r3816; mov.b32 %f4160, %r3817; 
mov.b32 %f4161, %r3814; mov.b32 %f4162, %r3815; mov.b32 %f4163, %r3810; mov.b32 %f4164, %r3811; mov.b32 %f4165, %r3812; mov.b32 %f4166, %r3813; mul.f32 %f4167, %f4163, %f4159; mul.f32 %f4168, %f4164, %f4160; mul.f32 %f4169, %f4165, %f4161; mul.f32 %f4170, %f4166, %f4162; fma.rn.f32 %f4171, %f4155, %f4151, %f4167; fma.rn.f32 %f4172, %f4156, %f4152, %f4168; fma.rn.f32 %f4173, %f4157, %f4153, %f4169; fma.rn.f32 %f4174, %f4158, %f4154, %f4170; add.f32 %f4175, %f4171, 0f00000000; add.f32 %f4176, %f4172, 0f00000000; add.f32 %f4177, %f4173, 0f00000000; add.f32 %f4178, %f4174, 0f00000000; sqrt.rn.f32 %f4179, %f4175; sqrt.rn.f32 %f4180, %f4176; sqrt.rn.f32 %f4181, %f4177; sqrt.rn.f32 %f4182, %f4178; mov.b32 %r3818, %f4179; mov.b32 %r3819, %f4180; mov.b32 %r3820, %f4181; mov.b32 %r3821, %f4182; cvt.u64.u32 %rd9340, %r3821; cvt.u64.u32 %rd9341, %r3819; cvt.u64.u32 %rd9342, %r3818; cvt.u64.u32 %rd9343, %r3820; bfi.b64 %rd12657, %rd9340, %rd9343, 32, 32; mov.b64 {%r3822, %r3823}, %rd12657; bfi.b64 %rd12656, %rd9341, %rd9342, 32, 32; mov.b64 {%r3824, %r3825}, %rd12656; mov.b32 %f4183, %r3824; mov.b32 %f4184, %r3825; mov.b32 %f4185, %r3822; mov.b32 %f4186, %r3823; setp.lt.f32 %p2051, %f4183, %f1076; setp.lt.f32 %p2052, %f4184, %f1076; setp.lt.f32 %p2053, %f4185, %f1076; setp.lt.f32 %p2054, %f4186, %f1076; selp.u32 %r3826, 1, 0, %p2051; selp.u32 %r3827, -1, 0, %p2052; bfi.b32 %r3828, %r3827, %r3826, 8, 1; selp.u32 %r3829, -1, 0, %p2053; bfi.b32 %r3830, %r3829, %r3828, 16, 1; selp.u32 %r3831, -1, 0, %p2054; bfi.b32 %r3832, %r3831, %r3830, 24, 1; cvt.u64.u32 %rd9344, %r3832; mov.b64 {%r3833, %r3834}, %rd9344; mov.b32 {%rs736, %rs737}, %r3833; and.b16 %rs738, %rs736, 1; shr.u16 %rs739, %rs736, 7; and.b16 %rs740, %rs739, 2; or.b16 %rs741, %rs740, %rs738; shl.b16 %rs742, %rs737, 2; and.b16 %rs743, %rs742, 4; or.b16 %rs744, %rs741, %rs743; shr.u16 %rs745, %rs737, 5; and.b16 %rs746, %rs745, 8; or.b16 %rs747, %rs744, %rs746; cvt.u64.u16 %rd3624, %rs747; @%p2946 bra $L__BB1_1497; bra.uni 
$L__BB1_1496; $L__BB1_1497: mov.u64 %rd9345, 1; st.local.v2.u64 [%rd8], {%rd12551, %rd12550}; st.local.v2.u64 [%rd8+16], {%rd12549, %rd12548}; mov.f32 %f4187, 0f00000000; st.local.v4.f32 [%rd24], {%f4187, %f4187, %f4187, %f4187}; mov.u32 %r3845, 4; st.local.u32 [%rd3434+16], %r3845; st.local.u32 [%rd3434+52], %r3845; st.local.u32 [%rd3434+88], %r3845; st.local.u32 [%rd3434+124], %r3845; mov.u64 %rd3629, %rd9345; $L__BB1_1498: add.s64 %rd9346, %rd3629, -1; cvt.u32.u64 %r3846, %rd9346; shl.b64 %rd9348, %rd9345, %r3846; and.b64 %rd9349, %rd9348, %rd3624; setp.eq.s64 %p2055, %rd9349, 0; @%p2055 bra $L__BB1_1551; shl.b64 %rd9350, %rd3629, 3; add.s64 %rd9351, %rd8, %rd9350; ld.local.u64 %rd3630, [%rd9351+-8]; setp.eq.s64 %p2056, %rd3630, 0; @%p2056 bra $L__BB1_1551; ld.u32 %r1128, [%rd3630]; cvt.u64.u32 %rd3631, %r1128; ld.global.u64 %rd9352, [%rd3470+112]; setp.gt.u64 %p2057, %rd9352, %rd3631; @%p2057 bra $L__BB1_1502; bra.uni $L__BB1_1501; $L__BB1_1502: ld.global.u64 %rd9353, [%rd3470+104]; mul.lo.s64 %rd9354, %rd3631, 12; add.s64 %rd3632, %rd9353, %rd9354; ld.u32 %rd3633, [%rd3632+8]; ld.u32 %rd3634, [%rd3632]; ld.global.u64 %rd3635, [%rd3470+96]; setp.gt.u64 %p2058, %rd3635, %rd3634; @%p2058 bra $L__BB1_1504; bra.uni $L__BB1_1503; $L__BB1_1504: ld.global.u64 %rd3636, [%rd3470+88]; shl.b64 %rd9355, %rd3634, 3; add.s64 %rd9356, %rd3636, %rd9355; ld.u32 %rd9357, [%rd9356]; ld.u32 %rd9358, [%rd9356+4]; bfi.b64 %rd3637, %rd9358, %rd9357, 32, 32; ld.u32 %rd3638, [%rd3632+4]; setp.gt.u64 %p2059, %rd3635, %rd3638; @%p2059 bra $L__BB1_1506; bra.uni $L__BB1_1505; $L__BB1_1506: setp.gt.u64 %p2060, %rd3635, %rd3633; @%p2060 bra $L__BB1_1508; bra.uni $L__BB1_1507; $L__BB1_1508: shl.b64 %rd9359, %rd3638, 3; add.s64 %rd9360, %rd3636, %rd9359; shl.b64 %rd9361, %rd3633, 3; add.s64 %rd9362, %rd3636, %rd9361; cvt.u32.u64 %r3847, %rd3637; mov.b32 %f1077, %r3847; shr.u64 %rd9363, %rd3637, 32; cvt.u32.u64 %r3848, %rd9363; mov.b32 %f1078, %r3848; ld.u32 %rd9364, [%rd9360]; ld.u32 %rd9365, 
[%rd9360+4]; bfi.b64 %rd3639, %rd9365, %rd9364, 32, 32; cvt.u32.u64 %r3849, %rd3639; shr.u64 %rd9366, %rd3639, 32; cvt.u32.u64 %r3850, %rd9366; mov.b32 %f1079, %r3849; sub.f32 %f1080, %f1079, %f1077; mov.b32 %f5442, %r3850; sub.f32 %f1082, %f5442, %f1078; ld.u32 %rd9367, [%rd9362]; ld.u32 %rd9368, [%rd9362+4]; bfi.b64 %rd3640, %rd9368, %rd9367, 32, 32; cvt.u32.u64 %r3851, %rd3640; shr.u64 %rd9369, %rd3640, 32; cvt.u32.u64 %r3852, %rd9369; mov.b32 %f1083, %r3851; sub.f32 %f1084, %f1083, %f1077; mov.b32 %f1085, %r3852; sub.f32 %f1086, %f1085, %f1078; sub.f32 %f1087, %f1074, %f1077; sub.f32 %f1088, %f1075, %f1078; mul.f32 %f4188, %f1088, %f1082; fma.rn.f32 %f1089, %f1087, %f1080, %f4188; mul.f32 %f4189, %f1088, %f1086; fma.rn.f32 %f1090, %f1087, %f1084, %f4189; setp.le.f32 %p2061, %f1089, 0f00000000; setp.le.f32 %p2062, %f1090, 0f00000000; and.pred %p2063, %p2061, %p2062; @%p2063 bra $L__BB1_1546; bra.uni $L__BB1_1509; $L__BB1_1546: add.u64 %rd12642, %SP, 552; add.u64 %rd12648, %SP, 0; st.local.u64 [%rd3442], %rd3637; mov.u64 %rd12653, 2; mov.u64 %rd12639, %rd3455; mov.u64 %rd12640, %rd3454; mov.u64 %rd12641, %rd3454; mov.u64 %rd12643, %rd3454; mov.u64 %rd12644, %rd3454; mov.u64 %rd12645, %rd12642; mov.u64 %rd12646, %rd3442; mov.u64 %rd12647, %rd3442; mov.u64 %rd12649, %rd3442; mov.u64 %rd12650, %rd3442; mov.u64 %rd12651, %rd12648; mov.u64 %rd12652, %rd3443; $L__BB1_1547: setp.eq.s64 %p2116, %rd12653, 0; mov.u64 %rd12654, 1; @%p2116 bra $L__BB1_1549; add.s64 %rd12653, %rd12653, -1; add.s64 %rd9514, %rd12640, 8; setp.eq.s64 %p2117, %rd12643, %rd12639; selp.b64 %rd9515, %rd9514, %rd12643, %p2117; add.s64 %rd9516, %rd12641, 8; selp.b64 %rd9517, %rd9516, %rd12644, %p2117; add.s64 %rd9518, %rd12642, 8; selp.b64 %rd9519, %rd9518, %rd12645, %p2117; mov.u64 %rd12654, 0; setp.eq.s64 %p2118, %rd12653, 0; add.s64 %rd9520, %rd9515, 4; add.s64 %rd9521, %rd9517, 4; add.s64 %rd9522, %rd9519, 4; selp.b64 %rd3866, %rd9515, %rd9520, %p2118; selp.b64 %rd12644, %rd9517, %rd9521, %p2118; 
selp.b64 %rd12645, %rd9519, %rd9522, %p2118; selp.b64 %rd12640, %rd9514, %rd12640, %p2117; selp.b64 %rd12641, %rd9516, %rd12641, %p2117; selp.b64 %rd12642, %rd9518, %rd12642, %p2117; add.s64 %rd9523, %rd12643, 8; selp.b64 %rd12639, %rd9523, %rd12639, %p2117; add.s64 %rd9524, %rd12649, 8; setp.eq.s64 %p2119, %rd12646, %rd12652; selp.b64 %rd9525, %rd9524, %rd12646, %p2119; add.s64 %rd9526, %rd12650, 8; selp.b64 %rd9527, %rd9526, %rd12647, %p2119; add.s64 %rd9528, %rd12651, 8; selp.b64 %rd9529, %rd9528, %rd12648, %p2119; selp.b64 %rd12649, %rd9524, %rd12649, %p2119; selp.b64 %rd12650, %rd9526, %rd12650, %p2119; selp.b64 %rd12651, %rd9528, %rd12651, %p2119; add.s64 %rd9530, %rd12646, 8; selp.b64 %rd12652, %rd9530, %rd12652, %p2119; add.s64 %rd9531, %rd9525, 4; add.s64 %rd9532, %rd9527, 4; add.s64 %rd9533, %rd9529, 4; selp.b64 %rd12646, %rd9525, %rd9531, %p2118; selp.b64 %rd12647, %rd9527, %rd9532, %p2118; selp.b64 %rd12648, %rd9529, %rd9533, %p2118; ld.local.f32 %f4255, [%rd9527]; ld.local.f32 %f4256, [%rd9517]; setp.eq.f32 %p2120, %f4256, %f4255; mov.u64 %rd12643, %rd3866; @%p2120 bra $L__BB1_1547; $L__BB1_1549: mov.u64 %rd11219, 0; or.b64 %rd9535, %rd11219, %rd3637; mov.b64 {%r3894, %r3895}, %rd9535; mov.b64 {%r3896, %r3897}, %rd12654; cvt.u32.u64 %r3899, %rd11219; or.b32 %r5237, %r3899, %r3847; mov.u32 %r5238, 0; mov.b32 %f5446, %r3895; mov.b32 {%rs1035, %rs766}, %r3896; mov.u32 %r5239, %r5238; bra.uni $L__BB1_1550; $L__BB1_1509: sub.f32 %f1091, %f1074, %f1079; sub.f32 %f1092, %f1075, %f5442; mul.f32 %f4190, %f1082, %f1092; fma.rn.f32 %f1093, %f1080, %f1091, %f4190; mul.f32 %f4191, %f1092, %f1086; fma.rn.f32 %f1094, %f1091, %f1084, %f4191; setp.ge.f32 %p2064, %f1093, 0f00000000; setp.le.f32 %p2065, %f1094, %f1093; and.pred %p2066, %p2064, %p2065; @%p2066 bra $L__BB1_1542; bra.uni $L__BB1_1510; $L__BB1_1542: add.u64 %rd12626, %SP, 552; add.u64 %rd12632, %SP, 0; st.local.u64 [%rd3442], %rd3639; mov.u64 %rd12637, 2; mov.u64 %rd12623, %rd3455; mov.u64 %rd12624, %rd3454; 
mov.u64 %rd12625, %rd3454; mov.u64 %rd12627, %rd3454; mov.u64 %rd12628, %rd3454; mov.u64 %rd12629, %rd12626; mov.u64 %rd12630, %rd3442; mov.u64 %rd12631, %rd3442; mov.u64 %rd12633, %rd3442; mov.u64 %rd12634, %rd3442; mov.u64 %rd12635, %rd12632; mov.u64 %rd12636, %rd3445; $L__BB1_1543: setp.eq.s64 %p2111, %rd12637, 0; mov.u64 %rd12638, 1; @%p2111 bra $L__BB1_1545; add.s64 %rd12637, %rd12637, -1; add.s64 %rd9487, %rd12624, 8; setp.eq.s64 %p2112, %rd12627, %rd12623; selp.b64 %rd9488, %rd9487, %rd12627, %p2112; add.s64 %rd9489, %rd12625, 8; selp.b64 %rd9490, %rd9489, %rd12628, %p2112; add.s64 %rd9491, %rd12626, 8; selp.b64 %rd9492, %rd9491, %rd12629, %p2112; mov.u64 %rd12638, 0; setp.eq.s64 %p2113, %rd12637, 0; add.s64 %rd9493, %rd9488, 4; add.s64 %rd9494, %rd9490, 4; add.s64 %rd9495, %rd9492, 4; selp.b64 %rd3828, %rd9488, %rd9493, %p2113; selp.b64 %rd12628, %rd9490, %rd9494, %p2113; selp.b64 %rd12629, %rd9492, %rd9495, %p2113; selp.b64 %rd12624, %rd9487, %rd12624, %p2112; selp.b64 %rd12625, %rd9489, %rd12625, %p2112; selp.b64 %rd12626, %rd9491, %rd12626, %p2112; add.s64 %rd9496, %rd12627, 8; selp.b64 %rd12623, %rd9496, %rd12623, %p2112; add.s64 %rd9497, %rd12633, 8; setp.eq.s64 %p2114, %rd12630, %rd12636; selp.b64 %rd9498, %rd9497, %rd12630, %p2114; add.s64 %rd9499, %rd12634, 8; selp.b64 %rd9500, %rd9499, %rd12631, %p2114; add.s64 %rd9501, %rd12635, 8; selp.b64 %rd9502, %rd9501, %rd12632, %p2114; selp.b64 %rd12633, %rd9497, %rd12633, %p2114; selp.b64 %rd12634, %rd9499, %rd12634, %p2114; selp.b64 %rd12635, %rd9501, %rd12635, %p2114; add.s64 %rd9503, %rd12630, 8; selp.b64 %rd12636, %rd9503, %rd12636, %p2114; add.s64 %rd9504, %rd9498, 4; add.s64 %rd9505, %rd9500, 4; add.s64 %rd9506, %rd9502, 4; selp.b64 %rd12630, %rd9498, %rd9504, %p2113; selp.b64 %rd12631, %rd9500, %rd9505, %p2113; selp.b64 %rd12632, %rd9502, %rd9506, %p2113; ld.local.f32 %f4253, [%rd9500]; ld.local.f32 %f4254, [%rd9490]; setp.eq.f32 %p2115, %f4254, %f4253; mov.u64 %rd12627, %rd3828; @%p2115 bra 
$L__BB1_1543; $L__BB1_1545: mov.u64 %rd11218, 0; or.b64 %rd9508, %rd11218, %rd3639; mov.b64 {%r3886, %r3887}, %rd9508; mov.b64 {%r3888, %r3889}, %rd12638; cvt.u32.u64 %r3891, %rd11218; or.b32 %r5237, %r3891, %r3849; mov.u32 %r5238, 0; mov.b32 %f5446, %r3887; mov.u32 %r5239, 1; mov.b32 {%rs1035, %rs762}, %r3888; bra.uni $L__BB1_1550; $L__BB1_1510: sub.f32 %f1095, %f1074, %f1083; sub.f32 %f1096, %f1075, %f1085; mul.f32 %f4192, %f1082, %f1096; fma.rn.f32 %f1097, %f1080, %f1095, %f4192; mul.f32 %f4193, %f1086, %f1096; fma.rn.f32 %f1098, %f1084, %f1095, %f4193; setp.ge.f32 %p2067, %f1098, 0f00000000; setp.le.f32 %p2068, %f1097, %f1098; and.pred %p2069, %p2068, %p2067; @%p2069 bra $L__BB1_1538; bra.uni $L__BB1_1511; $L__BB1_1538: add.u64 %rd12610, %SP, 552; add.u64 %rd12616, %SP, 0; st.local.u64 [%rd3442], %rd3640; mov.u64 %rd12621, 2; mov.u64 %rd12607, %rd3455; mov.u64 %rd12608, %rd3454; mov.u64 %rd12609, %rd3454; mov.u64 %rd12611, %rd3454; mov.u64 %rd12612, %rd3454; mov.u64 %rd12613, %rd12610; mov.u64 %rd12614, %rd3442; mov.u64 %rd12615, %rd3442; mov.u64 %rd12617, %rd3442; mov.u64 %rd12618, %rd3442; mov.u64 %rd12619, %rd12616; mov.u64 %rd12620, %rd3447; $L__BB1_1539: setp.eq.s64 %p2106, %rd12621, 0; mov.u64 %rd12622, 1; @%p2106 bra $L__BB1_1541; add.s64 %rd12621, %rd12621, -1; add.s64 %rd9460, %rd12608, 8; setp.eq.s64 %p2107, %rd12611, %rd12607; selp.b64 %rd9461, %rd9460, %rd12611, %p2107; add.s64 %rd9462, %rd12609, 8; selp.b64 %rd9463, %rd9462, %rd12612, %p2107; add.s64 %rd9464, %rd12610, 8; selp.b64 %rd9465, %rd9464, %rd12613, %p2107; mov.u64 %rd12622, 0; setp.eq.s64 %p2108, %rd12621, 0; add.s64 %rd9466, %rd9461, 4; add.s64 %rd9467, %rd9463, 4; add.s64 %rd9468, %rd9465, 4; selp.b64 %rd3790, %rd9461, %rd9466, %p2108; selp.b64 %rd12612, %rd9463, %rd9467, %p2108; selp.b64 %rd12613, %rd9465, %rd9468, %p2108; selp.b64 %rd12608, %rd9460, %rd12608, %p2107; selp.b64 %rd12609, %rd9462, %rd12609, %p2107; selp.b64 %rd12610, %rd9464, %rd12610, %p2107; add.s64 %rd9469, %rd12611, 
8; selp.b64 %rd12607, %rd9469, %rd12607, %p2107; add.s64 %rd9470, %rd12617, 8; setp.eq.s64 %p2109, %rd12614, %rd12620; selp.b64 %rd9471, %rd9470, %rd12614, %p2109; add.s64 %rd9472, %rd12618, 8; selp.b64 %rd9473, %rd9472, %rd12615, %p2109; add.s64 %rd9474, %rd12619, 8; selp.b64 %rd9475, %rd9474, %rd12616, %p2109; selp.b64 %rd12617, %rd9470, %rd12617, %p2109; selp.b64 %rd12618, %rd9472, %rd12618, %p2109; selp.b64 %rd12619, %rd9474, %rd12619, %p2109; add.s64 %rd9476, %rd12614, 8; selp.b64 %rd12620, %rd9476, %rd12620, %p2109; add.s64 %rd9477, %rd9471, 4; add.s64 %rd9478, %rd9473, 4; add.s64 %rd9479, %rd9475, 4; selp.b64 %rd12614, %rd9471, %rd9477, %p2108; selp.b64 %rd12615, %rd9473, %rd9478, %p2108; selp.b64 %rd12616, %rd9475, %rd9479, %p2108; ld.local.f32 %f4251, [%rd9473]; ld.local.f32 %f4252, [%rd9463]; setp.eq.f32 %p2110, %f4252, %f4251; mov.u64 %rd12611, %rd3790; @%p2110 bra $L__BB1_1539; $L__BB1_1541: mov.u64 %rd11217, 0; or.b64 %rd9481, %rd11217, %rd3640; mov.b64 {%r3878, %r3879}, %rd9481; mov.b64 {%r3880, %r3881}, %rd12622; cvt.u32.u64 %r3883, %rd11217; or.b32 %r5237, %r3883, %r3851; mov.u32 %r5238, 0; mov.b32 %f5446, %r3879; mov.b32 {%rs1035, %rs758}, %r3880; mov.u32 %r5239, 2; bra.uni $L__BB1_1550; $L__BB1_1511: sub.f32 %f1099, %f1083, %f1079; sub.f32 %f1100, %f1085, %f5442; mul.f32 %f4194, %f1082, %f1084; mul.f32 %f4195, %f1080, %f1086; sub.f32 %f1101, %f4195, %f4194; mul.f32 %f4196, %f1087, %f1082; mul.f32 %f4197, %f1088, %f1080; sub.f32 %f4198, %f4197, %f4196; mul.f32 %f4199, %f4198, %f1101; setp.lt.f32 %p2070, %f4199, 0f00000000; setp.ge.f32 %p2071, %f1089, 0f00000000; and.pred %p2072, %p2071, %p2070; setp.le.f32 %p2073, %f1093, 0f00000000; and.pred %p2074, %p2073, %p2072; mov.u16 %rs1034, 0; @%p2074 bra $L__BB1_1514; mul.f32 %f4200, %f1084, %f1096; mul.f32 %f4201, %f1095, %f1086; sub.f32 %f4202, %f4200, %f4201; mul.f32 %f4203, %f1101, %f4202; setp.gt.f32 %p2075, %f4203, 0f80000000; setp.ge.f32 %p2076, %f1090, 0f00000000; and.pred %p2077, %p2076, %p2075; 
setp.le.f32 %p2078, %f1098, 0f00000000; and.pred %p2079, %p2078, %p2077; mov.u16 %rs1034, 1; @%p2079 bra $L__BB1_1514; mul.f32 %f4204, %f1099, %f1092; mul.f32 %f4205, %f1091, %f1100; sub.f32 %f4206, %f4204, %f4205; mul.f32 %f4207, %f1101, %f4206; setp.lt.f32 %p2080, %f4207, 0f00000000; sub.f32 %f4208, %f1094, %f1093; setp.ge.f32 %p2081, %f4208, 0f00000000; and.pred %p2082, %p2081, %p2080; sub.f32 %f4209, %f1097, %f1098; setp.ge.f32 %p2083, %f4209, 0f00000000; and.pred %p2084, %p2083, %p2082; selp.b16 %rs1034, 2, 3, %p2084; $L__BB1_1514: mul.f32 %f4210, %f1082, %f1082; fma.rn.f32 %f4211, %f1080, %f1080, %f4210; add.f32 %f1102, %f4211, 0f00000000; mul.f32 %f4212, %f1086, %f1086; fma.rn.f32 %f4213, %f1084, %f1084, %f4212; add.f32 %f1103, %f4213, 0f00000000; mul.f32 %f4214, %f1100, %f1100; fma.rn.f32 %f4215, %f1099, %f1099, %f4214; add.f32 %f1104, %f4215, 0f00000000; setp.eq.s16 %p2085, %rs1034, 1; @%p2085 bra $L__BB1_1529; setp.eq.s16 %p2086, %rs1034, 2; @%p2086 bra $L__BB1_1525; setp.ne.s16 %p2087, %rs1034, 3; @%p2087 bra $L__BB1_1533; sub.f32 %f4216, %f1089, %f1093; div.rn.f32 %f1105, %f1089, %f4216; sub.f32 %f4217, %f1090, %f1098; div.rn.f32 %f1106, %f1090, %f4217; sub.f32 %f4218, %f1094, %f1093; add.f32 %f4219, %f1097, %f4218; sub.f32 %f4220, %f4219, %f1098; div.rn.f32 %f5444, %f4218, %f4220; mul.f32 %f4221, %f1088, %f1088; fma.rn.f32 %f4222, %f1087, %f1087, %f4221; add.f32 %f4223, %f4222, 0f00000000; mul.f32 %f4224, %f1102, %f1105; mul.f32 %f4225, %f1105, %f4224; sub.f32 %f1108, %f4223, %f4225; mul.f32 %f4226, %f1103, %f5444; mul.f32 %f4227, %f5444, %f4226; sub.f32 %f1109, %f4223, %f4227; mul.f32 %f4228, %f1092, %f1092; fma.rn.f32 %f4229, %f1091, %f1091, %f4228; add.f32 %f4230, %f4229, 0f00000000; mul.f32 %f4231, %f1104, %f1106; mul.f32 %f4232, %f1106, %f4231; sub.f32 %f1110, %f4230, %f4232; setp.lt.f32 %p2088, %f1108, %f1109; @%p2088 bra $L__BB1_1521; bra.uni $L__BB1_1518; $L__BB1_1521: setp.lt.f32 %p2090, %f1108, %f1110; @%p2090 bra $L__BB1_1523; bra.uni 
$L__BB1_1522; $L__BB1_1523: mul.f32 %f5443, %f1082, %f1105; fma.rn.f32 %f5441, %f1080, %f1105, %f1077; mov.u32 %r5239, 0; mov.f32 %f5442, %f1078; mov.f32 %f5444, %f1105; bra.uni $L__BB1_1524; $L__BB1_1525: add.u64 %rd12560, %SP, 552; add.u64 %rd12566, %SP, 0; mul.f32 %f4235, %f1092, %f1100; fma.rn.f32 %f4236, %f1091, %f1099, %f4235; div.rn.f32 %f5445, %f4236, %f1104; fma.rn.f32 %f4237, %f1099, %f5445, %f1079; mov.b32 %r3860, %f4237; fma.rn.f32 %f4238, %f1100, %f5445, %f5442; mov.b32 %r3861, %f4238; cvt.u64.u32 %rd9373, %r3861; cvt.u64.u32 %rd9374, %r3860; bfi.b64 %rd3648, %rd9373, %rd9374, 32, 32; st.local.u64 [%rd3442], %rd3648; mov.u64 %rd12571, 2; mov.u64 %rd12557, %rd3455; mov.u64 %rd12558, %rd3454; mov.u64 %rd12559, %rd3454; mov.u64 %rd12561, %rd3454; mov.u64 %rd12562, %rd3454; mov.u64 %rd12563, %rd12560; mov.u64 %rd12564, %rd3442; mov.u64 %rd12565, %rd3442; mov.u64 %rd12567, %rd3442; mov.u64 %rd12568, %rd3442; mov.u64 %rd12569, %rd12566; mov.u64 %rd12570, %rd3453; $L__BB1_1526: setp.eq.s64 %p2091, %rd12571, 0; mov.u64 %rd12606, 1; @%p2091 bra $L__BB1_1528; add.s64 %rd12571, %rd12571, -1; add.s64 %rd9379, %rd12558, 8; setp.eq.s64 %p2092, %rd12561, %rd12557; selp.b64 %rd9380, %rd9379, %rd12561, %p2092; add.s64 %rd9381, %rd12559, 8; selp.b64 %rd9382, %rd9381, %rd12562, %p2092; add.s64 %rd9383, %rd12560, 8; selp.b64 %rd9384, %rd9383, %rd12563, %p2092; mov.u64 %rd12606, 0; setp.eq.s64 %p2093, %rd12571, 0; add.s64 %rd9385, %rd9380, 4; add.s64 %rd9386, %rd9382, 4; add.s64 %rd9387, %rd9384, 4; selp.b64 %rd3665, %rd9380, %rd9385, %p2093; selp.b64 %rd12562, %rd9382, %rd9386, %p2093; selp.b64 %rd12563, %rd9384, %rd9387, %p2093; selp.b64 %rd12558, %rd9379, %rd12558, %p2092; selp.b64 %rd12559, %rd9381, %rd12559, %p2092; selp.b64 %rd12560, %rd9383, %rd12560, %p2092; add.s64 %rd9388, %rd12561, 8; selp.b64 %rd12557, %rd9388, %rd12557, %p2092; add.s64 %rd9389, %rd12567, 8; setp.eq.s64 %p2094, %rd12564, %rd12570; selp.b64 %rd9390, %rd9389, %rd12564, %p2094; add.s64 %rd9391, 
%rd12568, 8; selp.b64 %rd9392, %rd9391, %rd12565, %p2094; add.s64 %rd9393, %rd12569, 8; selp.b64 %rd9394, %rd9393, %rd12566, %p2094; selp.b64 %rd12567, %rd9389, %rd12567, %p2094; selp.b64 %rd12568, %rd9391, %rd12568, %p2094; selp.b64 %rd12569, %rd9393, %rd12569, %p2094; add.s64 %rd9395, %rd12564, 8; selp.b64 %rd12570, %rd9395, %rd12570, %p2094; add.s64 %rd9396, %rd9390, 4; add.s64 %rd9397, %rd9392, 4; add.s64 %rd9398, %rd9394, 4; selp.b64 %rd12564, %rd9390, %rd9396, %p2093; selp.b64 %rd12565, %rd9392, %rd9397, %p2093; selp.b64 %rd12566, %rd9394, %rd9398, %p2093; ld.local.f32 %f4239, [%rd9392]; ld.local.f32 %f4240, [%rd9382]; setp.eq.f32 %p2095, %f4240, %f4239; mov.u64 %rd12561, %rd3665; @%p2095 bra $L__BB1_1526; $L__BB1_1528: mov.u64 %rd11214, 0; or.b64 %rd12605, %rd11214, %rd3648; mov.u32 %r5239, 1; bra.uni $L__BB1_1537; $L__BB1_1529: add.u64 %rd12576, %SP, 552; add.u64 %rd12582, %SP, 0; div.rn.f32 %f5445, %f1090, %f1103; fma.rn.f32 %f4241, %f1084, %f5445, %f1077; mov.b32 %r3863, %f4241; fma.rn.f32 %f4242, %f1086, %f5445, %f1078; mov.b32 %r3864, %f4242; cvt.u64.u32 %rd9400, %r3864; cvt.u64.u32 %rd9401, %r3863; bfi.b64 %rd3689, %rd9400, %rd9401, 32, 32; st.local.u64 [%rd3442], %rd3689; mov.u64 %rd12587, 2; mov.u64 %rd12573, %rd3455; mov.u64 %rd12574, %rd3454; mov.u64 %rd12575, %rd3454; mov.u64 %rd12577, %rd3454; mov.u64 %rd12578, %rd3454; mov.u64 %rd12579, %rd12576; mov.u64 %rd12580, %rd3442; mov.u64 %rd12581, %rd3442; mov.u64 %rd12583, %rd3442; mov.u64 %rd12584, %rd3442; mov.u64 %rd12585, %rd12582; mov.u64 %rd12586, %rd3451; $L__BB1_1530: setp.eq.s64 %p2096, %rd12587, 0; mov.u64 %rd12606, 1; @%p2096 bra $L__BB1_1532; add.s64 %rd12587, %rd12587, -1; add.s64 %rd9406, %rd12574, 8; setp.eq.s64 %p2097, %rd12577, %rd12573; selp.b64 %rd9407, %rd9406, %rd12577, %p2097; add.s64 %rd9408, %rd12575, 8; selp.b64 %rd9409, %rd9408, %rd12578, %p2097; add.s64 %rd9410, %rd12576, 8; selp.b64 %rd9411, %rd9410, %rd12579, %p2097; mov.u64 %rd12606, 0; setp.eq.s64 %p2098, %rd12587, 0; 
add.s64 %rd9412, %rd9407, 4; add.s64 %rd9413, %rd9409, 4; add.s64 %rd9414, %rd9411, 4; selp.b64 %rd3706, %rd9407, %rd9412, %p2098; selp.b64 %rd12578, %rd9409, %rd9413, %p2098; selp.b64 %rd12579, %rd9411, %rd9414, %p2098; selp.b64 %rd12574, %rd9406, %rd12574, %p2097; selp.b64 %rd12575, %rd9408, %rd12575, %p2097; selp.b64 %rd12576, %rd9410, %rd12576, %p2097; add.s64 %rd9415, %rd12577, 8; selp.b64 %rd12573, %rd9415, %rd12573, %p2097; add.s64 %rd9416, %rd12583, 8; setp.eq.s64 %p2099, %rd12580, %rd12586; selp.b64 %rd9417, %rd9416, %rd12580, %p2099; add.s64 %rd9418, %rd12584, 8; selp.b64 %rd9419, %rd9418, %rd12581, %p2099; add.s64 %rd9420, %rd12585, 8; selp.b64 %rd9421, %rd9420, %rd12582, %p2099; selp.b64 %rd12583, %rd9416, %rd12583, %p2099; selp.b64 %rd12584, %rd9418, %rd12584, %p2099; selp.b64 %rd12585, %rd9420, %rd12585, %p2099; add.s64 %rd9422, %rd12580, 8; selp.b64 %rd12586, %rd9422, %rd12586, %p2099; add.s64 %rd9423, %rd9417, 4; add.s64 %rd9424, %rd9419, 4; add.s64 %rd9425, %rd9421, 4; selp.b64 %rd12580, %rd9417, %rd9423, %p2098; selp.b64 %rd12581, %rd9419, %rd9424, %p2098; selp.b64 %rd12582, %rd9421, %rd9425, %p2098; ld.local.f32 %f4243, [%rd9419]; ld.local.f32 %f4244, [%rd9409]; setp.eq.f32 %p2100, %f4244, %f4243; mov.u64 %rd12577, %rd3706; @%p2100 bra $L__BB1_1530; $L__BB1_1532: mov.u64 %rd11215, 0; or.b64 %rd12605, %rd11215, %rd3689; mov.u32 %r5239, 2; bra.uni $L__BB1_1537; $L__BB1_1533: div.rn.f32 %f5445, %f1089, %f1102; fma.rn.f32 %f4245, %f1080, %f5445, %f1077; mov.b32 %r3866, %f4245; fma.rn.f32 %f4246, %f1082, %f5445, %f1078; mov.b32 %r3867, %f4246; cvt.u64.u32 %rd9427, %r3867; cvt.u64.u32 %rd9428, %r3866; bfi.b64 %rd3730, %rd9427, %rd9428, 32, 32; st.local.u64 [%rd3442], %rd3730; mov.u64 %rd12603, 2; mov.u64 %rd12589, %rd3455; mov.u64 %rd12590, %rd3454; mov.u64 %rd12591, %rd3454; mov.u64 %rd12592, %rd9157; mov.u64 %rd12593, %rd3454; mov.u64 %rd12594, %rd3454; mov.u64 %rd12595, %rd9157; mov.u64 %rd12596, %rd3442; mov.u64 %rd12597, %rd3442; mov.u64 %rd12598, 
%rd9151; mov.u64 %rd12599, %rd3442; mov.u64 %rd12600, %rd3442; mov.u64 %rd12601, %rd9151; mov.u64 %rd12602, %rd3449; $L__BB1_1534: setp.eq.s64 %p2101, %rd12603, 0; mov.u64 %rd12606, 1; @%p2101 bra $L__BB1_1536; add.s64 %rd12603, %rd12603, -1; add.s64 %rd9433, %rd12590, 8; setp.eq.s64 %p2102, %rd12593, %rd12589; selp.b64 %rd9434, %rd9433, %rd12593, %p2102; add.s64 %rd9435, %rd12591, 8; selp.b64 %rd9436, %rd9435, %rd12594, %p2102; add.s64 %rd9437, %rd12592, 8; selp.b64 %rd9438, %rd9437, %rd12595, %p2102; mov.u64 %rd12606, 0; setp.eq.s64 %p2103, %rd12603, 0; add.s64 %rd9439, %rd9434, 4; add.s64 %rd9440, %rd9436, 4; add.s64 %rd9441, %rd9438, 4; selp.b64 %rd3747, %rd9434, %rd9439, %p2103; selp.b64 %rd12594, %rd9436, %rd9440, %p2103; selp.b64 %rd12595, %rd9438, %rd9441, %p2103; selp.b64 %rd12590, %rd9433, %rd12590, %p2102; selp.b64 %rd12591, %rd9435, %rd12591, %p2102; selp.b64 %rd12592, %rd9437, %rd12592, %p2102; add.s64 %rd9442, %rd12593, 8; selp.b64 %rd12589, %rd9442, %rd12589, %p2102; add.s64 %rd9443, %rd12599, 8; setp.eq.s64 %p2104, %rd12596, %rd12602; selp.b64 %rd9444, %rd9443, %rd12596, %p2104; add.s64 %rd9445, %rd12600, 8; selp.b64 %rd9446, %rd9445, %rd12597, %p2104; add.s64 %rd9447, %rd12601, 8; selp.b64 %rd9448, %rd9447, %rd12598, %p2104; selp.b64 %rd12599, %rd9443, %rd12599, %p2104; selp.b64 %rd12600, %rd9445, %rd12600, %p2104; selp.b64 %rd12601, %rd9447, %rd12601, %p2104; add.s64 %rd9449, %rd12596, 8; selp.b64 %rd12602, %rd9449, %rd12602, %p2104; add.s64 %rd9450, %rd9444, 4; add.s64 %rd9451, %rd9446, 4; add.s64 %rd9452, %rd9448, 4; selp.b64 %rd12596, %rd9444, %rd9450, %p2103; selp.b64 %rd12597, %rd9446, %rd9451, %p2103; selp.b64 %rd12598, %rd9448, %rd9452, %p2103; ld.local.f32 %f4247, [%rd9446]; ld.local.f32 %f4248, [%rd9436]; setp.eq.f32 %p2105, %f4248, %f4247; mov.u64 %rd12593, %rd3747; @%p2105 bra $L__BB1_1534; $L__BB1_1536: mov.u64 %rd11216, 0; or.b64 %rd12605, %rd11216, %rd3730; mov.u32 %r5239, 0; $L__BB1_1537: mov.f32 %f4249, 0f3F800000; sub.f32 %f4250, 
%f4249, %f5445; mov.b32 %r3870, %f4250; mov.b32 %r3871, %f5445; cvt.u64.u32 %rd9453, %r3871; cvt.u64.u32 %rd9454, %r3870; bfi.b64 %rd12655, %rd9453, %rd9454, 32, 32; mov.b64 {%r3872, %r3873}, %rd12606; mov.b64 {%r3874, %r3875}, %rd12605; cvt.u32.u64 %r5237, %rd12605; mov.b32 %f5446, %r3875; mov.u32 %r5238, 1; mov.b32 {%rs1035, %rs754}, %r3872; bra.uni $L__BB1_1550; $L__BB1_1518: setp.lt.f32 %p2089, %f1109, %f1110; @%p2089 bra $L__BB1_1520; bra.uni $L__BB1_1519; $L__BB1_1520: mul.f32 %f5443, %f1086, %f1106; fma.rn.f32 %f5441, %f1084, %f1106, %f1077; mov.u32 %r5239, 2; mov.f32 %f5442, %f1078; mov.f32 %f5444, %f1106; bra.uni $L__BB1_1524; $L__BB1_1522: mul.f32 %f5443, %f1100, %f5444; fma.rn.f32 %f5441, %f1099, %f5444, %f1079; mov.u32 %r5239, 1; bra.uni $L__BB1_1524; $L__BB1_1519: mul.f32 %f5443, %f1100, %f5444; fma.rn.f32 %f5441, %f1099, %f5444, %f1079; mov.u32 %r5239, 1; $L__BB1_1524: add.f32 %f5446, %f5442, %f5443; mov.f32 %f4233, 0f3F800000; sub.f32 %f4234, %f4233, %f5444; mov.b32 %r3858, %f4234; mov.b32 %r3859, %f5444; cvt.u64.u32 %rd9370, %r3859; cvt.u64.u32 %rd9371, %r3858; bfi.b64 %rd12655, %rd9370, %rd9371, 32, 32; mov.b32 %r5237, %f5441; mov.u32 %r5238, 1; mov.u16 %rs1035, 1; $L__BB1_1550: mov.b32 %f4257, %r5237; sub.f32 %f4258, %f4257, %f1074; mul.f32 %f4259, %f4258, %f4258; sub.f32 %f4260, %f5446, %f1075; fma.rn.f32 %f4261, %f4260, %f4260, %f4259; add.f32 %f4262, %f4261, 0f00000000; sqrt.rn.f32 %f4263, %f4262; shl.b64 %rd9536, %rd3629, 2; add.s64 %rd9537, %rd24, %rd9536; st.local.f32 [%rd9537+-4], %f4263; mul.lo.s64 %rd9538, %rd3629, 36; add.s64 %rd9539, %rd3434, %rd9538; st.local.u32 [%rd9539+-36], %r5237; st.local.f32 [%rd9539+-32], %f5446; mov.u16 %rs767, 0; st.local.v4.u8 [%rd9539+-28], {%rs1035, %rs767, %rs767, %rs767}; st.local.u32 [%rd9539+-24], %r1128; st.local.u32 [%rd9539+-20], %r5238; st.local.u32 [%rd9539+-16], %r5239; shr.u64 %rd9540, %rd12655, 32; st.local.u32 [%rd9539+-8], %rd9540; st.local.u32 [%rd9539+-12], %rd12655; $L__BB1_1551: 
setp.lt.u64 %p2121, %rd3629, 4; add.s64 %rd3629, %rd3629, 1; @%p2121 bra $L__BB1_1498; ld.local.v2.u64 {%rd12656, %rd12657}, [%rd24]; ld.local.v4.u32 {%r5249, %r5250, %r5251, %r3903}, [%rd3434]; ld.local.u32 %r5252, [%rd3434+16]; ld.local.u32 %rd9543, [%rd3434+36]; ld.local.u32 %rd9544, [%rd3434+40]; bfi.b64 %rd9545, %rd9544, %rd9543, 32, 32; mov.b64 {%r5246, %r5247}, %rd9545; ld.local.u32 %r5248, [%rd3434+44]; ld.local.u32 %r5253, [%rd3434+52]; ld.local.u32 %r5245, [%rd3434+80]; ld.local.u64 %rd9546, [%rd3434+72]; mov.b64 {%r5243, %r5244}, %rd9546; ld.local.u32 %r5254, [%rd3434+88]; ld.local.u32 %rd9547, [%rd3434+108]; ld.local.u32 %rd9548, [%rd3434+112]; bfi.b64 %rd9549, %rd9548, %rd9547, 32, 32; mov.b64 {%r5240, %r5241}, %rd9549; ld.local.u32 %r5242, [%rd3434+116]; ld.local.u32 %r5255, [%rd3434+124]; bra.uni $L__BB1_1553; $L__BB1_1496: mov.u32 %r5252, 4; mov.u32 %r5253, %r5252; mov.u32 %r5254, %r5252; mov.u32 %r5255, %r5252; $L__BB1_1553: and.b64 %rd9550, %rd3624, 1; setp.eq.b64 %p2122, %rd9550, 1; mov.pred %p2123, 0; xor.pred %p2124, %p2122, %p2123; not.pred %p2125, %p2124; mov.b64 {%r1171, %r1172}, %rd12656; mov.b32 %f1133, %r1171; mov.b32 %f1134, %r1172; mov.b64 {%r1173, %r1174}, %rd12657; mov.b32 %f1135, %r1173; mov.b32 %f1136, %r1174; @%p2125 bra $L__BB1_1562; bra.uni $L__BB1_1554; $L__BB1_1562: and.b64 %rd9566, %rd3624, 2; setp.eq.s64 %p2139, %rd9566, 0; @%p2139 bra $L__BB1_1571; bra.uni $L__BB1_1563; $L__BB1_1571: and.b64 %rd9582, %rd3624, 4; setp.eq.s64 %p2153, %rd9582, 0; @%p2153 bra $L__BB1_1580; bra.uni $L__BB1_1572; $L__BB1_1580: and.b64 %rd9598, %rd3624, 8; setp.eq.s64 %p2167, %rd9598, 0; @%p2167 bra $L__BB1_1488; ld.u8 %rs774, [%rd3613+88]; and.b16 %rs775, %rs774, 1; setp.eq.b16 %p2168, %rs775, 1; mov.pred %p2169, 0; xor.pred %p2170, %p2168, %p2169; not.pred %p2171, %p2170; @%p2171 bra $L__BB1_1584; bra.uni $L__BB1_1582; $L__BB1_1584: ld.u32 %r1222, [%rd3613+76]; cvt.u64.u32 %rd9602, %r1222; setp.le.u64 %p2178, %rd3601, %rd9602; @%p2178 bra 
$L__BB1_1488; neg.f32 %f1140, %f1136; setp.lt.u32 %p2179, %r1127, 64; @%p2179 bra $L__BB1_1587; bra.uni $L__BB1_1586; $L__BB1_1587: mul.wide.u32 %rd9612, %r1127, 8; add.s64 %rd9613, %rd3438, %rd9612; mov.u64 %rd12664, 0; st.local.u32 [%rd9613], %r1222; st.local.f32 [%rd9613+4], %f1140; add.s32 %r1127, %r1127, 1; st.local.u32 [%rd3438+512], %r1127; mov.u64 %rd12665, %rd12664; bra.uni $L__BB1_1588; $L__BB1_1554: ld.u8 %rs768, [%rd3613+88]; and.b16 %rs769, %rs768, 1; setp.eq.b16 %p2126, %rs769, 1; xor.pred %p2128, %p2126, %p2123; not.pred %p2129, %p2128; @%p2129 bra $L__BB1_1557; bra.uni $L__BB1_1555; $L__BB1_1557: ld.u32 %r1180, [%rd3613+64]; cvt.u64.u32 %rd9554, %r1180; setp.le.u64 %p2136, %rd3601, %rd9554; @%p2136 bra $L__BB1_1562; neg.f32 %f1137, %f1133; setp.lt.u32 %p2137, %r1127, 64; @%p2137 bra $L__BB1_1560; bra.uni $L__BB1_1559; $L__BB1_1560: add.s32 %r3906, %r1126, -1; mul.wide.u32 %rd9564, %r3906, 8; add.s64 %rd9565, %rd3438, %rd9564; mov.u64 %rd12658, 0; st.local.u32 [%rd9565], %r1180; st.local.f32 [%rd9565+4], %f1137; add.s32 %r1127, %r1127, 1; st.local.u32 [%rd3438+512], %r1127; mov.u64 %rd12659, %rd12658; bra.uni $L__BB1_1561; $L__BB1_1563: ld.u8 %rs770, [%rd3613+88]; and.b16 %rs771, %rs770, 1; setp.eq.b16 %p2140, %rs771, 1; mov.pred %p2141, 0; xor.pred %p2142, %p2140, %p2141; not.pred %p2143, %p2142; @%p2143 bra $L__BB1_1566; bra.uni $L__BB1_1564; $L__BB1_1566: ld.u32 %r1194, [%rd3613+68]; cvt.u64.u32 %rd9570, %r1194; setp.le.u64 %p2150, %rd3601, %rd9570; @%p2150 bra $L__BB1_1571; neg.f32 %f1138, %f1134; setp.lt.u32 %p2151, %r1127, 64; @%p2151 bra $L__BB1_1569; bra.uni $L__BB1_1568; $L__BB1_1569: mul.wide.u32 %rd9580, %r1127, 8; add.s64 %rd9581, %rd3438, %rd9580; mov.u64 %rd12660, 0; st.local.u32 [%rd9581], %r1194; st.local.f32 [%rd9581+4], %f1138; add.s32 %r1127, %r1127, 1; st.local.u32 [%rd3438+512], %r1127; mov.u64 %rd12661, %rd12660; bra.uni $L__BB1_1570; $L__BB1_1572: ld.u8 %rs772, [%rd3613+88]; and.b16 %rs773, %rs772, 1; setp.eq.b16 %p2154, 
%rs773, 1; mov.pred %p2155, 0; xor.pred %p2156, %p2154, %p2155; not.pred %p2157, %p2156; @%p2157 bra $L__BB1_1575; bra.uni $L__BB1_1573; $L__BB1_1575: ld.u32 %r1208, [%rd3613+72]; cvt.u64.u32 %rd9586, %r1208; setp.le.u64 %p2164, %rd3601, %rd9586; @%p2164 bra $L__BB1_1580; neg.f32 %f1139, %f1135; setp.lt.u32 %p2165, %r1127, 64; @%p2165 bra $L__BB1_1578; bra.uni $L__BB1_1577; $L__BB1_1578: mul.wide.u32 %rd9596, %r1127, 8; add.s64 %rd9597, %rd3438, %rd9596; mov.u64 %rd12662, 0; st.local.u32 [%rd9597], %r1208; st.local.f32 [%rd9597+4], %f1139; add.s32 %r1127, %r1127, 1; st.local.u32 [%rd3438+512], %r1127; mov.u64 %rd12663, %rd12662; bra.uni $L__BB1_1579; $L__BB1_1555: setp.leu.f32 %p2130, %f1076, %f1133; setp.eq.s32 %p2131, %r5252, 4; or.pred %p2132, %p2131, %p2130; @%p2132 bra $L__BB1_1562; ld.u32 %r3904, [%rd3613+64]; cvt.u64.u32 %rd9551, %r3904; setp.le.u64 %p2133, %rd3604, %rd9551; mul.wide.u32 %rd9552, %r3904, 12; add.s64 %rd9553, %rd3605, %rd9552; setp.eq.s64 %p2134, %rd9553, 0; or.pred %p2135, %p2133, %p2134; selp.b32 %r1122, %r1122, %r5251, %p2135; selp.b32 %r1121, %r1121, %r5250, %p2135; selp.b32 %r1120, %r1120, %r5249, %p2135; selp.b32 %r1124, %r1124, %r5252, %p2135; selp.b32 %r1125, %r1125, %r1171, %p2135; bra.uni $L__BB1_1562; $L__BB1_1582: mov.b32 %f4266, %r1125; setp.leu.f32 %p2172, %f4266, %f1136; setp.eq.s32 %p2173, %r5255, 4; or.pred %p2174, %p2173, %p2172; @%p2174 bra $L__BB1_1488; bra.uni $L__BB1_1583; $L__BB1_1564: mov.b32 %f4264, %r1125; setp.leu.f32 %p2144, %f4264, %f1134; setp.eq.s32 %p2145, %r5253, 4; or.pred %p2146, %p2145, %p2144; @%p2146 bra $L__BB1_1571; ld.u32 %r3912, [%rd3613+68]; cvt.u64.u32 %rd9567, %r3912; setp.le.u64 %p2147, %rd3604, %rd9567; mul.wide.u32 %rd9568, %r3912, 12; add.s64 %rd9569, %rd3605, %rd9568; setp.eq.s64 %p2148, %rd9569, 0; or.pred %p2149, %p2147, %p2148; selp.b32 %r1122, %r1122, %r5248, %p2149; selp.b32 %r1121, %r1121, %r5247, %p2149; selp.b32 %r1120, %r1120, %r5246, %p2149; selp.b32 %r1124, %r1124, %r5253, %p2149; 
selp.b32 %r1125, %r1125, %r1172, %p2149; bra.uni $L__BB1_1571; $L__BB1_1573: mov.b32 %f4265, %r1125; setp.leu.f32 %p2158, %f4265, %f1135; setp.eq.s32 %p2159, %r5254, 4; or.pred %p2160, %p2159, %p2158; @%p2160 bra $L__BB1_1580; ld.u32 %r3919, [%rd3613+72]; cvt.u64.u32 %rd9583, %r3919; setp.le.u64 %p2161, %rd3604, %rd9583; mul.wide.u32 %rd9584, %r3919, 12; add.s64 %rd9585, %rd3605, %rd9584; setp.eq.s64 %p2162, %rd9585, 0; or.pred %p2163, %p2161, %p2162; selp.b32 %r1122, %r1122, %r5245, %p2163; selp.b32 %r1121, %r1121, %r5244, %p2163; selp.b32 %r1120, %r1120, %r5243, %p2163; selp.b32 %r1124, %r1124, %r5254, %p2163; selp.b32 %r1125, %r1125, %r1173, %p2163; bra.uni $L__BB1_1580; $L__BB1_1586: mov.u64 %rd12665, 1; shl.b64 %rd12664, %rd9602, 32; $L__BB1_1588: mov.u64 %rd11229, 0; cvt.u32.u64 %r3928, %rd11229; cvt.u32.u64 %r3929, %rd12664; or.b32 %r3930, %r3929, %r3928; cvt.u32.u64 %r3931, %rd12665; or.b32 %r3932, %r3930, %r3931; setp.eq.s32 %p2180, %r3932, 0; @%p2180 bra $L__BB1_1488; bra.uni $L__BB1_1589; $L__BB1_1559: mov.u64 %rd12659, 1; shl.b64 %rd12658, %rd9554, 32; $L__BB1_1561: mov.u64 %rd11220, 0; cvt.u32.u64 %r3907, %rd11220; cvt.u32.u64 %r3908, %rd12658; or.b32 %r3909, %r3908, %r3907; cvt.u32.u64 %r3910, %rd12659; or.b32 %r3911, %r3909, %r3910; setp.ne.s32 %p2138, %r3911, 0; @%p2138 bra $L__BB1_1589; bra.uni $L__BB1_1562; $L__BB1_1568: mov.u64 %rd12661, 1; shl.b64 %rd12660, %rd9570, 32; $L__BB1_1570: mov.u64 %rd11223, 0; cvt.u32.u64 %r3914, %rd11223; cvt.u32.u64 %r3915, %rd12660; or.b32 %r3916, %r3915, %r3914; cvt.u32.u64 %r3917, %rd12661; or.b32 %r3918, %r3916, %r3917; setp.ne.s32 %p2152, %r3918, 0; @%p2152 bra $L__BB1_1589; bra.uni $L__BB1_1571; $L__BB1_1577: mov.u64 %rd12663, 1; shl.b64 %rd12662, %rd9586, 32; $L__BB1_1579: mov.u64 %rd11226, 0; cvt.u32.u64 %r3921, %rd11226; cvt.u32.u64 %r3922, %rd12662; or.b32 %r3923, %r3922, %r3921; cvt.u32.u64 %r3924, %rd12663; or.b32 %r3925, %r3923, %r3924; setp.ne.s32 %p2166, %r3925, 0; @%p2166 bra $L__BB1_1589; bra.uni 
$L__BB1_1580; $L__BB1_1590: setp.eq.s32 %p2181, %r1124, 4; mov.u64 %rd12666, %rd9286; mov.u64 %rd12667, %rd9284; mov.u64 %rd12668, %rd9286; @%p2181 bra $L__BB1_1592; mov.b64 %rd12668, {%r1120, %r1121}; mov.b32 {%rs776, %rs777}, %r1122; mov.b64 %rd9620, {%r1122, %r3933}; and.b64 %rd12666, %rd9620, 4294967040; cvt.u64.u16 %rd9621, %rs776; and.b64 %rd12667, %rd9621, 255; $L__BB1_1592: or.b64 %rd9628, %rd12667, %rd12666; or.b64 %rd9629, %rd9628, %rd9286; mov.b64 {%r3934, %r3935}, %rd9629; mov.b32 {%rs156, %rs778}, %r3934; and.b16 %rs779, %rs156, 255; setp.eq.s16 %p2182, %rs779, 2; @%p2182 bra $L__BB1_1594; cvt.u32.u64 %r3936, %rd12668; mov.b32 %f4267, %r3936; shr.u64 %rd9630, %rd12668, 32; cvt.u32.u64 %r3937, %rd9630; mov.b32 %f4268, %r3937; mul.f32 %f4269, %f1073, %f4267; mul.f32 %f4270, %f1072, %f4268; sub.f32 %f4271, %f4269, %f4270; mul.f32 %f4272, %f1073, %f4268; fma.rn.f32 %f4273, %f1072, %f4267, %f4272; add.f32 %f4274, %f1070, %f4271; mov.b32 %r3938, %f4274; add.f32 %f4275, %f1071, %f4273; mov.b32 %r3939, %f4275; cvt.u64.u32 %rd9631, %r3939; cvt.u64.u32 %rd9632, %r3938; cvt.u64.u16 %rd9633, %rs156; bfi.b64 %rd9286, %rd9631, %rd9632, 32, 32; and.b64 %rd9634, %rd9633, 255; mov.b64 {%r3940, %r3941}, %rd9634; mov.b32 {%rs780, %rs781}, %r3940; cvt.u64.u16 %rd9284, %rs780; $L__BB1_1594: mov.u64 %rd11238, 0; or.b64 %rd9641, %rd11238, %rd9284; or.b64 %rd3953, %rd9641, %rd11238; mov.b64 {%r3942, %r3943}, %rd3953; mov.b32 {%rs157, %rs782}, %r3942; and.b16 %rs783, %rs157, 255; setp.eq.s16 %p2183, %rs783, 2; mov.u64 %rd12671, 2; mov.u64 %rd12672, %rd11238; mov.u64 %rd12673, %rd11238; @%p2183 bra $L__BB1_1596; and.b64 %rd9643, %rd3953, 4294967040; cvt.u64.u16 %rd9644, %rs157; and.b64 %rd9645, %rd9644, 255; or.b64 %rd9646, %rd9645, %rd11238; or.b64 %rd9647, %rd9646, %rd9643; mov.b64 {%r3944, %r3945}, %rd9647; mov.b32 {%rs784, %rs785}, %r3944; not.b16 %rs786, %rs784; ld.global.u8 %rs787, [%rd3470+240]; setp.eq.s16 %p2184, %rs787, 0; and.b16 %rs788, %rs786, 1; selp.b16 %rs789, 
%rs784, %rs788, %p2184; and.b64 %rd9648, %rd9647, 4294967040; cvt.u64.u16 %rd9649, %rs789; and.b64 %rd9650, %rd9649, 255; or.b64 %rd9651, %rd9648, %rd11238; or.b64 %rd9652, %rd9651, %rd9650; mov.b64 {%r3946, %r3947}, %rd9652; mov.b32 {%rs790, %rs791}, %r3946; and.b64 %rd12673, %rd9652, 4294967040; cvt.u64.u16 %rd9653, %rs790; and.b64 %rd12671, %rd9653, 255; mov.u64 %rd12672, %rd9286; $L__BB1_1596: or.b64 %rd9654, %rd12672, %rd11238; or.b64 %rd9655, %rd11238, %rd12671; or.b64 %rd9656, %rd9655, %rd12673; or.b64 %rd9657, %rd9654, %rd11238; mov.b64 {%r5286, %r5287}, %rd9657; mov.b64 {%r5288, %r3948}, %rd9656; bra.uni $L__BB1_1653; $L__BB1_1456: cvt.u32.u64 %r3741, %rd3475; cvt.u32.u64 %r3742, %rd3500; rem.u32 %r3743, %r3742, %r3741; cvt.u64.u32 %rd12513, %r3743; $L__BB1_1457: shl.b64 %rd9191, %rd12513, 3; add.s64 %rd3504, %rd3476, %rd9191; ld.u32 %rd9192, [%rd3504]; ld.u32 %rd9193, [%rd3504+4]; bfi.b64 %rd3505, %rd9193, %rd9192, 32, 32; add.s64 %rd3506, %rd12513, 1; or.b64 %rd9194, %rd3506, %rd3475; and.b64 %rd9195, %rd9194, -4294967296; setp.eq.s64 %p2002, %rd9195, 0; @%p2002 bra $L__BB1_1459; rem.u64 %rd12514, %rd3506, %rd3475; bra.uni $L__BB1_1460; $L__BB1_1459: cvt.u32.u64 %r3744, %rd3475; cvt.u32.u64 %r3745, %rd3506; rem.u32 %r3746, %r3745, %r3744; cvt.u64.u32 %rd12514, %r3746; $L__BB1_1460: add.u64 %rd12524, %SP, 560; shl.b64 %rd9197, %rd12514, 3; add.s64 %rd3516, %rd3476, %rd9197; ld.u32 %rd9198, [%rd3516]; ld.u32 %rd9199, [%rd3516+4]; bfi.b64 %rd9200, %rd9199, %rd9198, 32, 32; st.local.v2.u64 [%rd3434], {%rd3505, %rd9200}; mov.u64 %rd12529, 2; mov.u64 %rd12515, %rd3463; mov.u64 %rd12516, %rd3461; mov.u64 %rd12517, %rd3461; mov.u64 %rd12518, %rd3462; mov.u64 %rd12519, %rd3461; mov.u64 %rd12520, %rd3461; mov.u64 %rd12521, %rd3462; mov.u64 %rd12522, %rd3434; mov.u64 %rd12523, %rd3434; mov.u64 %rd12525, %rd3434; mov.u64 %rd12526, %rd3434; mov.u64 %rd12527, %rd12524; mov.u64 %rd12528, %rd3464; $L__BB1_1461: setp.eq.s64 %p2003, %rd12529, 0; @%p2003 bra $L__BB1_1464; 
add.s64 %rd12529, %rd12529, -1; add.s64 %rd9201, %rd12516, 8; setp.eq.s64 %p2004, %rd12519, %rd12515; selp.b64 %rd9202, %rd9201, %rd12519, %p2004; add.s64 %rd9203, %rd12517, 8; selp.b64 %rd9204, %rd9203, %rd12520, %p2004; add.s64 %rd9205, %rd12518, 8; selp.b64 %rd9206, %rd9205, %rd12521, %p2004; setp.eq.s64 %p2005, %rd12529, 0; add.s64 %rd9207, %rd9202, 4; add.s64 %rd9208, %rd9204, 4; add.s64 %rd9209, %rd9206, 4; selp.b64 %rd3533, %rd9202, %rd9207, %p2005; selp.b64 %rd12520, %rd9204, %rd9208, %p2005; selp.b64 %rd12521, %rd9206, %rd9209, %p2005; selp.b64 %rd12516, %rd9201, %rd12516, %p2004; selp.b64 %rd12517, %rd9203, %rd12517, %p2004; selp.b64 %rd12518, %rd9205, %rd12518, %p2004; add.s64 %rd9210, %rd12519, 8; selp.b64 %rd12515, %rd9210, %rd12515, %p2004; add.s64 %rd9211, %rd12525, 8; setp.eq.s64 %p2006, %rd12522, %rd12528; selp.b64 %rd9212, %rd9211, %rd12522, %p2006; add.s64 %rd9213, %rd12526, 8; selp.b64 %rd9214, %rd9213, %rd12523, %p2006; add.s64 %rd9215, %rd12527, 8; selp.b64 %rd9216, %rd9215, %rd12524, %p2006; selp.b64 %rd12525, %rd9211, %rd12525, %p2006; selp.b64 %rd12526, %rd9213, %rd12526, %p2006; selp.b64 %rd12527, %rd9215, %rd12527, %p2006; add.s64 %rd9217, %rd12522, 8; selp.b64 %rd12528, %rd9217, %rd12528, %p2006; add.s64 %rd9218, %rd9212, 4; add.s64 %rd9219, %rd9214, 4; add.s64 %rd9220, %rd9216, 4; selp.b64 %rd12522, %rd9212, %rd9218, %p2005; selp.b64 %rd12523, %rd9214, %rd9219, %p2005; selp.b64 %rd12524, %rd9216, %rd9220, %p2005; ld.local.f32 %f4038, [%rd9214]; ld.local.f32 %f4039, [%rd9204]; setp.eq.f32 %p2007, %f4039, %f4038; mov.u64 %rd12519, %rd3533; @%p2007 bra $L__BB1_1461; bra.uni $L__BB1_1463; $L__BB1_1464: ld.u32 %rd9221, [%rd3504]; ld.u32 %rd9222, [%rd3504+4]; bfi.b64 %rd9223, %rd9222, %rd9221, 32, 32; cvt.u32.u64 %r3747, %rd9223; mov.b32 %f4040, %r3747; shr.u64 %rd9224, %rd9223, 32; cvt.u32.u64 %r3748, %rd9224; mov.b32 %f4041, %r3748; ld.u32 %rd9225, [%rd3516]; ld.u32 %rd9226, [%rd3516+4]; bfi.b64 %rd9227, %rd9226, %rd9225, 32, 32; 
cvt.u32.u64 %r3749, %rd9227; shr.u64 %rd9228, %rd9227, 32; cvt.u32.u64 %r3750, %rd9228; mov.b32 %f4042, %r3749; sub.f32 %f5439, %f4042, %f4040; mov.b32 %f4043, %r3750; sub.f32 %f5440, %f4043, %f4041; bra.uni $L__BB1_1475; $L__BB1_1469: cvt.u32.u64 %r3751, %rd3475; cvt.u32.u64 %r3752, %rd3547; rem.u32 %r3753, %r3752, %r3751; cvt.u64.u32 %rd12530, %r3753; $L__BB1_1470: shl.b64 %rd9237, %rd12530, 3; add.s64 %rd9238, %rd3476, %rd9237; ld.u32 %rd9239, [%rd9238]; ld.u32 %rd9240, [%rd9238+4]; bfi.b64 %rd3558, %rd9240, %rd9239, 32, 32; st.local.v2.u64 [%rd3434], {%rd3548, %rd3558}; mov.u64 %rd12545, 2; mov.u64 %rd12531, %rd3461; mov.u64 %rd12532, %rd3457; mov.u64 %rd12533, %rd3457; mov.u64 %rd12534, %rd3460; mov.u64 %rd12535, %rd3457; mov.u64 %rd12536, %rd3457; mov.u64 %rd12537, %rd3460; mov.u64 %rd12538, %rd3465; mov.u64 %rd12539, %rd3465; mov.u64 %rd12540, %rd3466; mov.u64 %rd12541, %rd3465; mov.u64 %rd12542, %rd3465; mov.u64 %rd12543, %rd3466; mov.u64 %rd12544, %rd3467; $L__BB1_1471: setp.eq.s64 %p2011, %rd12545, 0; @%p2011 bra $L__BB1_1474; add.s64 %rd12545, %rd12545, -1; add.s64 %rd9241, %rd12532, 8; setp.eq.s64 %p2012, %rd12535, %rd12531; selp.b64 %rd9242, %rd9241, %rd12535, %p2012; add.s64 %rd9243, %rd12533, 8; selp.b64 %rd9244, %rd9243, %rd12536, %p2012; add.s64 %rd9245, %rd12534, 8; selp.b64 %rd9246, %rd9245, %rd12537, %p2012; setp.eq.s64 %p2013, %rd12545, 0; add.s64 %rd9247, %rd9242, 4; add.s64 %rd9248, %rd9244, 4; add.s64 %rd9249, %rd9246, 4; selp.b64 %rd3575, %rd9242, %rd9247, %p2013; selp.b64 %rd12536, %rd9244, %rd9248, %p2013; selp.b64 %rd12537, %rd9246, %rd9249, %p2013; selp.b64 %rd12532, %rd9241, %rd12532, %p2012; selp.b64 %rd12533, %rd9243, %rd12533, %p2012; selp.b64 %rd12534, %rd9245, %rd12534, %p2012; add.s64 %rd9250, %rd12535, 8; selp.b64 %rd12531, %rd9250, %rd12531, %p2012; add.s64 %rd9251, %rd12541, 8; setp.eq.s64 %p2014, %rd12538, %rd12544; selp.b64 %rd9252, %rd9251, %rd12538, %p2014; add.s64 %rd9253, %rd12542, 8; selp.b64 %rd9254, %rd9253, %rd12539, 
%p2014; add.s64 %rd9255, %rd12543, 8; selp.b64 %rd9256, %rd9255, %rd12540, %p2014; selp.b64 %rd12541, %rd9251, %rd12541, %p2014; selp.b64 %rd12542, %rd9253, %rd12542, %p2014; selp.b64 %rd12543, %rd9255, %rd12543, %p2014; add.s64 %rd9257, %rd12538, 8; selp.b64 %rd12544, %rd9257, %rd12544, %p2014; add.s64 %rd9258, %rd9252, 4; add.s64 %rd9259, %rd9254, 4; add.s64 %rd9260, %rd9256, 4; selp.b64 %rd12538, %rd9252, %rd9258, %p2013; selp.b64 %rd12539, %rd9254, %rd9259, %p2013; selp.b64 %rd12540, %rd9256, %rd9260, %p2013; ld.local.f32 %f4044, [%rd9254]; ld.local.f32 %f4045, [%rd9244]; setp.eq.f32 %p2015, %f4045, %f4044; mov.u64 %rd12535, %rd3575; @%p2015 bra $L__BB1_1471; bra.uni $L__BB1_1473; $L__BB1_1474: cvt.u32.u64 %r3754, %rd3548; mov.b32 %f4046, %r3754; shr.u64 %rd9261, %rd3548, 32; cvt.u32.u64 %r3755, %rd9261; mov.b32 %f4047, %r3755; shr.u64 %rd9262, %rd3558, 32; cvt.u32.u64 %r3756, %rd9262; cvt.u32.u64 %r3757, %rd3558; mov.b32 %f4048, %r3757; sub.f32 %f4049, %f4048, %f4046; mov.b32 %f4050, %r3756; sub.f32 %f4051, %f4050, %f4047; neg.f32 %f5439, %f4049; neg.f32 %f5440, %f4051; $L__BB1_1475: mul.f32 %f4052, %f1062, %f5440; fma.rn.f32 %f1069, %f1061, %f5439, %f4052; mul.f32 %f4053, %f5440, %f5440; fma.rn.f32 %f4054, %f5439, %f5439, %f4053; add.f32 %f4055, %f4054, 0f00000000; sqrt.rn.f32 %f4056, %f4055; mul.f32 %f4057, %f4056, 0f3A83126F; abs.f32 %f4058, %f1069; setp.gt.f32 %p2016, %f4058, %f4057; @%p2016 bra $L__BB1_1477; bra.uni $L__BB1_1476; $L__BB1_1477: setp.ge.f32 %p2944, %f1069, 0f00000000; bra.uni $L__BB1_1480; $L__BB1_1476: ld.local.u64 %rd9263, [%rd3438+8]; cvt.u32.u64 %r3758, %rd9263; mov.b32 %f4059, %r3758; shr.u64 %rd9264, %rd9263, 32; cvt.u32.u64 %r3759, %rd9264; mov.b32 %f4060, %r3759; sub.f32 %f4061, %f859, %f4059; sub.f32 %f4062, %f860, %f4060; mul.f32 %f4063, %f1062, %f4062; fma.rn.f32 %f4064, %f1061, %f4061, %f4063; setp.le.f32 %p2944, %f4064, 0f00000000; $L__BB1_1480: selp.u16 %rs727, 1, 0, %p2944; st.local.u8 [%rd3438+16], %rs727; $L__BB1_1481: 
setp.eq.s32 %p2945, %r1106, 2; ld.local.v2.u32 {%r5225, %r5226}, [%rd3438+8]; ld.local.u32 %r5227, [%rd3438+16]; $L__BB1_1483: mov.u64 %rd9272, 0; mov.u64 %rd12546, 2; mov.u64 %rd12547, %rd9272; @%p2945 bra $L__BB1_1485; setp.ne.s16 %p2017, %rs141, 0; cvt.u16.u32 %rs729, %r5227; selp.u16 %rs730, 1, 0, %p2017; xor.b16 %rs731, %rs729, %rs730; mov.b32 %f4071, %r5225; mov.b32 %f4072, %r5226; mul.f32 %f4073, %f1032, %f4071; mul.f32 %f4074, %f1031, %f4072; sub.f32 %f4075, %f4073, %f4074; mul.f32 %f4076, %f1031, %f4071; fma.rn.f32 %f4077, %f1032, %f4072, %f4076; add.f32 %f4078, %f1029, %f4075; mov.b32 %r3764, %f4078; add.f32 %f4079, %f1030, %f4077; mov.b32 %r3765, %f4079; cvt.u64.u32 %rd9273, %r3765; cvt.u64.u32 %rd9274, %r3764; cvt.u64.u16 %rd9275, %rs731; bfi.b64 %rd12547, %rd9273, %rd9274, 32, 32; and.b64 %rd9276, %rd9275, 255; mov.b64 {%r3766, %r3767}, %rd9276; mov.b32 {%rs732, %rs733}, %r3766; cvt.u64.u16 %rd12546, %rs732; $L__BB1_1485: or.b64 %rd9277, %rd9272, %rd9272; or.b64 %rd9278, %rd12546, %rd9272; or.b64 %rd9279, %rd9278, %rd9272; or.b64 %rd9280, %rd9277, %rd12547; mov.b64 {%r5286, %r5287}, %rd9280; mov.b64 {%r5288, %r3768}, %rd9279; $L__BB1_1653: mov.b32 {%rs162, %rs801}, %r5288; and.b16 %rs802, %rs162, 255; setp.eq.s16 %p2262, %rs802, 2; @%p2262 bra $L__BB1_1655; mov.b64 %rd9730, {%r5288, %r4012}; shr.u64 %rd9731, %rd9730, 8; and.b64 %rd9732, %rd9731, 16777215; cvt.u64.u16 %rd9733, %rs162; and.b64 %rd9734, %rd9733, 255; mov.b64 %rd9735, {%r5286, %r5287}; bfi.b64 %rd9736, %rd9732, %rd9734, 8, 56; mov.b64 {%r4013, %r4014}, %rd9736; mov.b32 {%rs803, %rs804}, %r4013; shr.u64 %rd9737, %rd9735, 32; cvt.u32.u64 %r4015, %rd9737; mov.b32 %f4376, %r5286; sub.f32 %f4377, %f4376, %f859; mov.b32 %f4378, %r4015; sub.f32 %f4379, %f4378, %f860; mul.f32 %f4380, %f4379, %f4379; fma.rn.f32 %f4381, %f4377, %f4377, %f4380; add.f32 %f4382, %f4381, 0f00000000; sqrt.rn.f32 %f4383, %f4382; and.b16 %rs805, %rs803, 1; setp.eq.b16 %p2263, %rs805, 1; selp.f32 %f4384, 0fBF800000, 
0f3F800000, %p2263; mul.f32 %f4385, %f4384, %f4383; setp.ge.f32 %p2264, %f4385, %f1207; setp.le.f32 %p2265, %f4385, %f1207; selp.b16 %rs806, 1, 2, %p2265; setp.gtu.f32 %p2266, %f4385, %f1207; selp.b16 %rs807, -1, 0, %p2266; selp.b16 %rs808, %rs807, %rs806, %p2264; setp.eq.s16 %p2267, %rs808, 1; selp.f32 %f1207, %f4385, %f1207, %p2267; $L__BB1_1655: add.s64 %rd3470, %rd3470, 280; add.s64 %rd3471, %rd3471, 280; setp.ne.s64 %p2268, %rd2883, 0; add.s64 %rd3469, %rd3469, 280; @%p2268 bra $L__BB1_1425; $L__BB1_1656: cvta.to.global.u64 %rd4640, %rd5233; sub.f32 %f1208, %f2, %f178; sub.f32 %f1209, %f3, %f177; mov.u32 %r4019, 2; mov.u64 %rd9740, 0; mov.u64 %rd4642, %rd5233; mov.u64 %rd4049, %rd9740; mov.u64 %rd9757, %rd9740; @%p423 bra $L__BB1_1893; ld.param.u64 %rd4049, [grid_update_param_3]; add.u64 %rd9741, %SP, 560; add.u64 %rd4011, %SPL, 560; add.u64 %rd9745, %SP, 32; add.u64 %rd4015, %SPL, 32; add.s64 %rd4017, %rd4011, 8; add.u64 %rd9748, %SP, 0; add.u64 %rd4019, %SPL, 0; add.s64 %rd4020, %rd4019, 8; add.s64 %rd4022, %rd4019, 8; add.s64 %rd4024, %rd4019, 8; add.s64 %rd4026, %rd4019, 8; add.s64 %rd4028, %rd4019, 8; add.s64 %rd4030, %rd4019, 8; add.u64 %rd9754, %SP, 552; add.u64 %rd4031, %SPL, 552; add.s64 %rd4032, %rd4031, 8; add.s64 %rd4034, %rd4015, 36; add.s64 %rd4036, %rd4015, 4; add.s64 %rd4037, %rd9745, 36; add.s64 %rd4038, %rd4015, 44; add.s64 %rd4039, %rd9745, 44; add.s64 %rd4040, %rd4015, 52; add.s64 %rd4041, %rd4011, 8; add.s64 %rd4042, %rd4011, 8; or.b64 %rd4043, %rd9741, 8; add.s64 %rd4044, %rd4011, 16; cvta.to.global.u64 %rd12688, %rd5233; mov.u64 %rd12689, %rd5233; $L__BB1_1658: mov.u64 %rd4048, %rd12689; mov.u64 %rd4047, %rd12688; add.s64 %rd4049, %rd4049, -1; setp.eq.s64 %p2270, %rd4048, 0; @%p2270 bra $L__BB1_1892; add.s64 %rd4050, %rd4047, 272; ld.global.u32 %r4023, [%rd4047+272]; mov.u64 %rd9757, 0; setp.eq.s32 %p2271, %r4023, 3; mov.u32 %r4022, 2; @%p2271 bra $L__BB1_1889; ld.global.u16 %rs809, [%rd4050+-272]; setp.eq.s16 %p2272, %rs809, 1; @%p2272 
bra $L__BB1_1831; setp.eq.s16 %p2273, %rs809, 2; @%p2273 bra $L__BB1_1720; setp.ne.s16 %p2274, %rs809, 3; @%p2274 bra $L__BB1_1869; ld.global.u8 %rs163, [%rd4050+-248]; ld.global.f32 %f1210, [%rd4050+-16]; sub.f32 %f4386, %f1208, %f1210; ld.global.f32 %f1211, [%rd4050+-12]; sub.f32 %f4387, %f1209, %f1211; ld.global.f32 %f1212, [%rd4050+-20]; ld.global.f32 %f1213, [%rd4050+-24]; mul.f32 %f4388, %f4387, %f1212; fma.rn.f32 %f1214, %f4386, %f1213, %f4388; mul.f32 %f4389, %f4386, %f1212; mul.f32 %f4390, %f4387, %f1213; sub.f32 %f1215, %f4390, %f4389; mov.u32 %r1282, 2; st.local.u32 [%rd4015+20], %r1282; ld.global.u64 %rd4052, [%rd4050+-256]; setp.eq.s64 %p2276, %rd4052, 0; mov.pred %p2952, -1; @%p2276 bra $L__BB1_1717; mov.b32 %r4039, %f1215; ld.global.u64 %rd4053, [%rd4050+-264]; mov.b32 %r4040, %f1214; and.b32 %r4041, %r4040, 2147483647; mov.b32 %f1216, %r4041; and.b32 %r4042, %r4039, 2147483647; mov.b32 %f1217, %r4042; mov.u64 %rd12693, 1; bra.uni $L__BB1_1665; $L__BB1_1673: sub.f32 %f4402, %f5457, %f1214; abs.f32 %f1232, %f4402; setp.le.f32 %p2286, %f1232, 0f34000000; @%p2286 bra $L__BB1_1675; abs.f32 %f4403, %f5457; abs.f32 %f4404, %f1214; setp.gt.f32 %p2288, %f4404, %f4403; selp.f32 %f4405, %f4404, %f4403, %p2288; mul.f32 %f4406, %f4405, 0f34000000; setp.gtu.f32 %p2289, %f1232, %f4406; @%p2289 bra $L__BB1_1679; bra.uni $L__BB1_1675; $L__BB1_1665: shl.b64 %rd9761, %rd12693, 3; add.s64 %rd9762, %rd4053, %rd9761; setp.eq.s64 %p2277, %rd12693, %rd4052; selp.b64 %rd9763, 0, %rd12693, %p2277; shl.b64 %rd9764, %rd9763, 3; add.s64 %rd9765, %rd4053, %rd9764; ld.u32 %rd9766, [%rd9765]; ld.u32 %rd9767, [%rd9765+4]; bfi.b64 %rd4060, %rd9767, %rd9766, 32, 32; ld.u32 %rd9768, [%rd9762+-8]; ld.u32 %rd9769, [%rd9762+-4]; bfi.b64 %rd4061, %rd9769, %rd9768, 32, 32; cvt.u32.u64 %r5292, %rd4061; mov.b32 %f5457, %r5292; shr.u64 %rd9770, %rd4061, 32; cvt.u32.u64 %r4046, %rd9770; mov.b32 %f1220, %r4046; cvt.u32.u64 %r1265, %rd4060; shr.u64 %rd9771, %rd4060, 32; cvt.u32.u64 %r4047, 
%rd9771; mov.b32 %f1221, %r1265; sub.f32 %f1222, %f1221, %f5457; mov.b32 %f4392, %r4047; sub.f32 %f1223, %f4392, %f1220; sub.f32 %f4393, %f1214, %f5457; sub.f32 %f4394, %f1215, %f1220; mul.f32 %f4395, %f1223, %f4394; fma.rn.f32 %f1224, %f1222, %f4393, %f4395; mul.f32 %f4396, %f1223, %f1223; fma.rn.f32 %f4397, %f1222, %f1222, %f4396; add.f32 %f1225, %f4397, 0f00000000; setp.gtu.f32 %p2278, %f1224, 0f00000000; mov.b64 {%r4048, %r5293}, %rd4061; mov.b64 {%r4049, %r1267}, %rd4060; @%p2278 bra $L__BB1_1667; bra.uni $L__BB1_1666; $L__BB1_1667: setp.ltu.f32 %p2279, %f1224, %f1225; @%p2279 bra $L__BB1_1669; bra.uni $L__BB1_1668; $L__BB1_1669: setp.eq.f32 %p2280, %f1225, 0f00000000; @%p2280 bra $L__BB1_1716; div.rn.f32 %f4398, %f1224, %f1225; mov.f32 %f4399, 0f3F800000; sub.f32 %f4400, %f4399, %f4398; mov.b32 %r5295, %f4400; mov.b32 %r5296, %f4398; fma.rn.f32 %f5457, %f1222, %f4398, %f5457; mov.b32 %r5292, %f5457; fma.rn.f32 %f5458, %f1223, %f4398, %f1220; mov.b32 %r5293, %f5458; mov.u32 %r5294, 1; bra.uni $L__BB1_1671; $L__BB1_1666: mov.b32 %f5458, %r5293; mov.u32 %r5294, 0; mov.u32 %r5295, %r5294; bra.uni $L__BB1_1671; $L__BB1_1668: mov.b32 %f5458, %r1267; mov.u32 %r5295, 1; mov.u32 %r5294, 0; mov.f32 %f5457, %f1221; mov.u32 %r5292, %r1265; mov.u32 %r5293, %r1267; $L__BB1_1671: setp.eq.f32 %p2281, %f1214, %f5457; @%p2281 bra $L__BB1_1675; bra.uni $L__BB1_1672; $L__BB1_1675: setp.eq.f32 %p2291, %f5458, %f1215; mov.pred %p2290, -1; mov.pred %p2950, %p2290; @%p2291 bra $L__BB1_1679; setp.eq.f32 %p2293, %f1217, 0f7F800000; and.b32 %r4058, %r5293, 2147483647; mov.b32 %f4407, %r4058; setp.eq.f32 %p2294, %f4407, 0f7F800000; or.pred %p2295, %p2293, %p2294; mov.pred %p2950, 0; @%p2295 bra $L__BB1_1679; sub.f32 %f4408, %f5458, %f1215; abs.f32 %f1233, %f4408; setp.le.f32 %p2297, %f1233, 0f34000000; mov.pred %p2950, %p2290; @%p2297 bra $L__BB1_1679; abs.f32 %f4409, %f5458; abs.f32 %f4410, %f1215; setp.gt.f32 %p2298, %f4410, %f4409; selp.f32 %f4411, %f4410, %f4409, %p2298; mul.f32 
%f4412, %f4411, 0f34000000; setp.le.f32 %p2950, %f1233, %f4412; bra.uni $L__BB1_1679; $L__BB1_1672: setp.eq.f32 %p2283, %f1216, 0f7F800000; and.b32 %r4057, %r5292, 2147483647; mov.b32 %f4401, %r4057; setp.eq.f32 %p2284, %f4401, 0f7F800000; or.pred %p2285, %p2283, %p2284; mov.pred %p2950, 0; @%p2285 bra $L__BB1_1679; bra.uni $L__BB1_1673; $L__BB1_1679: cvt.u64.u32 %rd9772, %r5293; cvt.u64.u32 %rd9773, %r5292; bfi.b64 %rd4062, %rd9772, %rd9773, 32, 32; mov.b64 {%r4059, %r4060}, %rd4062; selp.u64 %rd4063, 1, 0, %p2950; mov.b32 %f1235, %r4060; mov.b32 %f1234, %r4059; sub.f32 %f4413, %f1234, %f1214; sub.f32 %f4414, %f1235, %f1215; mul.f32 %f4415, %f4414, %f4414; fma.rn.f32 %f4416, %f4413, %f4413, %f4415; add.f32 %f4417, %f4416, 0f00000000; sqrt.rn.f32 %f1237, %f4417; setp.geu.f32 %p2299, %f1237, %f5459; setp.ne.s32 %p2300, %r1282, 2; and.pred %p2301, %p2300, %p2299; @%p2301 bra $L__BB1_1681; add.s64 %rd12694, %rd12693, -1; st.local.u64 [%rd4015], %rd12694; st.local.v2.f32 [%rd4015+8], {%f1234, %f1235}; mov.b64 {%r4063, %r4064}, %rd4063; st.local.v2.u32 [%rd4015+16], {%r4063, %r5294}; st.local.v2.u32 [%rd4015+24], {%r5295, %r5296}; st.local.f32 [%rd4015+32], %f1237; st.local.u32 [%rd4015+36], %rd4061; st.local.u32 [%rd4015+44], %rd4060; st.local.u32 [%rd4015+40], %rd9770; st.local.u32 [%rd4015+48], %rd9771; mov.u32 %r5297, %r5295; mov.u64 %rd12695, %rd4061; mov.u64 %rd12696, %rd4060; mov.f32 %f5459, %f1237; mov.u32 %r1282, %r5294; $L__BB1_1681: add.s64 %rd4068, %rd12693, 1; setp.lt.u64 %p2302, %rd12693, %rd4052; mov.u64 %rd12693, %rd4068; @%p2302 bra $L__BB1_1665; cvt.u32.u64 %r4065, %rd12695; mov.b32 %f4418, %r4065; shr.u64 %rd9780, %rd12695, 32; cvt.u32.u64 %r4066, %rd9780; mov.b32 %f4419, %r4066; shr.u64 %rd9781, %rd12696, 32; cvt.u32.u64 %r4067, %rd9781; cvt.u32.u64 %r4068, %rd12696; mov.b32 %f4420, %r4068; sub.f32 %f1239, %f4420, %f4418; mov.b32 %f4421, %r4067; sub.f32 %f1240, %f4421, %f4419; mul.f32 %f4422, %f1240, %f1240; fma.rn.f32 %f4423, %f1239, %f1239, %f4422; 
add.f32 %f1241, %f4423, 0f00000000; setp.leu.f32 %p2303, %f1241, 0f28800000; mov.u64 %rd9779, 0; mov.u64 %rd12697, %rd9779; mov.u64 %rd12698, %rd9779; mov.u64 %rd12699, %rd9779; @%p2303 bra $L__BB1_1684; neg.f32 %f4424, %f1239; sqrt.rn.f32 %f4425, %f1241; div.rn.f32 %f4426, %f1240, %f4425; div.rn.f32 %f4427, %f4424, %f4425; mov.b32 %r4069, %f4427; mov.b32 %r4070, %f4426; mov.u64 %rd12699, 1; mov.b64 %rd9784, {%r4070, %r4069}; shr.u64 %rd12698, %rd9784, 32; shl.b64 %rd12697, %rd9784, 32; $L__BB1_1684: or.b64 %rd4075, %rd12699, %rd12697; or.b64 %rd4076, %rd9779, %rd12698; and.b64 %rd9785, %rd9779, 4294967295; xor.b64 %rd9786, %rd12699, 1; or.b64 %rd9787, %rd9786, %rd9785; setp.ne.s64 %p2304, %rd9787, 0; @%p2304 bra $L__BB1_1715; mov.b64 {%r4071, %r4072}, %rd4076; mov.b64 {%r4073, %r4074}, %rd4075; mov.b32 %f1242, %r4074; mov.b32 %f1243, %r4071; setp.eq.s32 %p2305, %r1282, 1; @%p2305 bra $L__BB1_1713; bra.uni $L__BB1_1686; $L__BB1_1713: ld.local.u64 %rd9864, [%rd4015+8]; cvt.u32.u64 %r4094, %rd9864; mov.b32 %f4455, %r4094; shr.u64 %rd9865, %rd9864, 32; cvt.u32.u64 %r4095, %rd9865; mov.b32 %f4456, %r4095; sub.f32 %f4457, %f1208, %f4455; sub.f32 %f4458, %f1209, %f4456; mul.f32 %f4459, %f1243, %f4458; fma.rn.f32 %f4460, %f1242, %f4457, %f4459; setp.le.f32 %p2951, %f4460, 0f00000000; bra.uni $L__BB1_1714; $L__BB1_1720: ld.global.f32 %f1251, [%rd4050+-16]; mov.u64 %rd9885, 0; sub.f32 %f4470, %f1208, %f1251; ld.global.f32 %f1252, [%rd4050+-12]; sub.f32 %f4471, %f1209, %f1252; ld.global.f32 %f1253, [%rd4050+-20]; ld.global.f32 %f1254, [%rd4050+-24]; mul.f32 %f4472, %f4471, %f1253; fma.rn.f32 %f1255, %f4470, %f1254, %f4472; mul.f32 %f4473, %f4470, %f1253; mul.f32 %f4474, %f4471, %f1254; sub.f32 %f1256, %f4474, %f4473; mov.b32 %r4103, %f1255; mov.b32 %r4104, %f1256; cvt.u64.u32 %rd9886, %r4104; cvt.u64.u32 %rd9887, %r4103; bfi.b64 %rd9888, %rd9886, %rd9887, 32, 32; st.local.u64 [%rd4031], %rd9888; ld.global.u64 %rd4178, [%rd4050+-240]; setp.eq.s64 %p2325, %rd4178, 0; mov.u64 
%rd9883, 2; mov.u64 %rd12853, %rd9885; mov.u64 %rd12854, %rd9883; mov.u64 %rd12855, %rd9885; @%p2325 bra $L__BB1_1826; mov.u32 %r4111, 0; st.local.u32 [%rd4015], %r4111; mov.u32 %r4112, -16777217; st.local.u32 [%rd4015+4], %r4112; mov.u32 %r1303, 1; st.local.u32 [%rd4015+512], %r1303; ld.global.u64 %rd4180, [%rd4050+-248]; ld.global.u64 %rd4181, [%rd4050+-192]; ld.global.u64 %rd4182, [%rd4050+-200]; mov.u32 %r1301, 2139095039; mov.u32 %r1300, 4; bra.uni $L__BB1_1722; $L__BB1_1831: ld.global.f32 %f1322, [%rd4050+-16]; sub.f32 %f4666, %f1208, %f1322; ld.global.f32 %f1323, [%rd4050+-12]; sub.f32 %f4667, %f1209, %f1323; ld.global.f32 %f1324, [%rd4050+-20]; ld.global.f32 %f1325, [%rd4050+-24]; mul.f32 %f4668, %f4667, %f1324; fma.rn.f32 %f1326, %f4666, %f1325, %f4668; mul.f32 %f4669, %f4666, %f1324; mul.f32 %f4670, %f4667, %f1325; sub.f32 %f1327, %f4670, %f4669; mov.b32 %r1404, %f1326; mov.b32 %r1405, %f1327; ld.global.v2.f32 {%f4671, %f4672}, [%rd4050+-216]; ld.global.v2.f32 {%f4673, %f4674}, [%rd4050+-224]; sub.f32 %f4675, %f1326, %f6; sub.f32 %f4676, %f1327, %f6; mov.b32 %r4283, %f4675; mov.b32 %r4284, %f4676; cvt.u64.u32 %rd10257, %r4284; cvt.u64.u32 %rd10258, %r4283; add.f32 %f4677, %f6, %f1326; add.f32 %f4678, %f6, %f1327; mov.b32 %r4285, %f4677; mov.b32 %r4286, %f4678; cvt.u64.u32 %rd10259, %r4286; cvt.u64.u32 %rd10260, %r4285; bfi.b64 %rd10261, %rd10257, %rd10258, 32, 32; mov.b64 {%r4287, %r4288}, %rd10261; bfi.b64 %rd10262, %rd10259, %rd10260, 32, 32; mov.b64 {%r4289, %r4290}, %rd10262; cvta.to.local.u64 %rd4541, %rd9745; mov.u16 %rs875, 2; st.local.u8 [%rd4541+8], %rs875; mov.b32 %f1335, %r4290; mov.b32 %f1333, %r4288; mov.b32 %f1334, %r4289; mov.b32 %f1332, %r4287; ld.global.v2.f32 {%f4679, %f4680}, [%rd4050+-232]; div.rn.f32 %f1338, %f1332, %f4679; div.rn.f32 %f1339, %f1334, %f4679; ld.global.u64 %rd4542, [%rd4050+-256]; cvt.rn.f32.u64 %f4681, %rd4542; add.f32 %f4682, %f4681, 0fBF800000; rcp.rn.f32 %f1340, %f4682; setp.lt.f32 %p2492, %f1339, 0fBF000000; 
setp.gt.f32 %p2493, %f1338, 0f3F000000; or.pred %p2494, %p2493, %p2492; @%p2494 bra $L__BB1_1863; add.f32 %f4683, %f1338, 0f3F000000; div.rn.f32 %f4684, %f4683, %f1340; cvt.rmi.f32.f32 %f4685, %f4684; add.s64 %rd10264, %rd4542, -2; cvt.rn.f32.u64 %f4686, %rd10264; setp.gt.f32 %p2495, %f4685, 0f00000000; setp.lt.f32 %p2496, %f4685, %f4686; selp.f32 %f4687, %f4685, %f4686, %p2496; selp.f32 %f4688, %f4687, 0f00000000, %p2495; setp.gt.f32 %p2497, %f4688, 0f5F7FFFFF; max.f32 %f4689, %f4688, 0f00000000; cvt.rzi.u64.f32 %rd10265, %f4689; selp.b64 %rd4548, -1, %rd10265, %p2497; add.f32 %f4690, %f1339, 0f3F000000; div.rn.f32 %f4691, %f4690, %f1340; cvt.rpi.f32.f32 %f4692, %f4691; add.s64 %rd10266, %rd4542, -1; cvt.rn.f32.u64 %f4693, %rd10266; setp.gt.f32 %p2498, %f4692, 0f00000000; setp.lt.f32 %p2499, %f4692, %f4693; selp.f32 %f4694, %f4692, %f4693, %p2499; selp.f32 %f4695, %f4694, 0f00000000, %p2498; setp.gt.f32 %p2500, %f4695, 0f5F7FFFFF; max.f32 %f4696, %f4695, 0f00000000; cvt.rzi.u64.f32 %rd10267, %f4696; selp.b64 %rd4544, -1, %rd10267, %p2500; setp.ge.u64 %p2501, %rd4548, %rd4544; @%p2501 bra $L__BB1_1863; div.rn.f32 %f1341, %f1333, %f4680; div.rn.f32 %f1342, %f1335, %f4680; ld.global.u64 %rd4545, [%rd4050+-240]; ld.global.u64 %rd4546, [%rd4050+-248]; ld.global.u64 %rd4547, [%rd4050+-264]; and.b32 %r4291, %r1404, 2147483647; mov.b32 %f1343, %r4291; and.b32 %r4292, %r1405, 2147483647; mov.b32 %f1344, %r4292; ld.local.v4.u32 {%r5357, %r5358, %r5359, %r4296}, [%rd4541]; mov.f32 %f5471, 0f7F7FFFFF; bra.uni $L__BB1_1834; $L__BB1_1869: ld.global.f32 %f1369, [%rd4050+-16]; sub.f32 %f4735, %f1208, %f1369; ld.global.f32 %f1370, [%rd4050+-12]; sub.f32 %f4736, %f1209, %f1370; ld.global.f32 %f1371, [%rd4050+-20]; ld.global.f32 %f1372, [%rd4050+-24]; mul.f32 %f4737, %f4736, %f1371; fma.rn.f32 %f1373, %f4735, %f1372, %f4737; mul.f32 %f4738, %f4735, %f1371; mul.f32 %f4739, %f4736, %f1372; sub.f32 %f1374, %f4739, %f4738; ld.global.u32 %rd10294, [%rd4050+-264]; ld.global.u32 %rd10295, 
[%rd4050+-260]; bfi.b64 %rd10296, %rd10295, %rd10294, 32, 32; cvt.u32.u64 %r4331, %rd10296; mov.b32 %f4740, %r4331; shr.u64 %rd10297, %rd10296, 32; cvt.u32.u64 %r4332, %rd10297; mov.b32 %f4741, %r4332; neg.f32 %f4742, %f4740; neg.f32 %f4743, %f4741; sub.f32 %f1375, %f4742, %f1373; sub.f32 %f1376, %f4743, %f1374; sub.f32 %f1377, %f1373, %f4740; sub.f32 %f1378, %f1374, %f4741; setp.ge.f32 %p2550, %f1375, 0f00000000; selp.f32 %f4744, %f1375, 0f00000000, %p2550; setp.ge.f32 %p2551, %f1376, 0f00000000; selp.f32 %f4745, %f1376, 0f00000000, %p2551; setp.ge.f32 %p2552, %f1377, 0f00000000; selp.f32 %f4746, %f1377, 0f00000000, %p2552; setp.ge.f32 %p2553, %f1378, 0f00000000; selp.f32 %f4747, %f1378, 0f00000000, %p2553; sub.f32 %f1379, %f4744, %f4746; mov.b32 %r4333, %f1379; sub.f32 %f1380, %f4745, %f4747; mov.b32 %r4334, %f1380; cvt.u64.u32 %rd10298, %r4334; cvt.u64.u32 %rd10299, %r4333; bfi.b64 %rd10300, %rd10298, %rd10299, 32, 32; st.local.u64 [%rd4011], %rd10300; mov.u64 %rd12869, 2; mov.u64 %rd12862, %rd4017; mov.u64 %rd12863, %rd4011; mov.u64 %rd12864, %rd4011; mov.u64 %rd12865, %rd9741; mov.u64 %rd12866, %rd4011; mov.u64 %rd12867, %rd4011; mov.u64 %rd12868, %rd9741; $L__BB1_1870: setp.eq.s64 %p2554, %rd12869, 0; @%p2554 bra $L__BB1_1873; add.s64 %rd12869, %rd12869, -1; add.s64 %rd10301, %rd12866, 8; setp.eq.s64 %p2555, %rd12866, %rd12862; selp.b64 %rd12862, %rd10301, %rd12862, %p2555; add.s64 %rd10302, %rd12863, 8; selp.b64 %rd12863, %rd10302, %rd12863, %p2555; add.s64 %rd10303, %rd12864, 8; selp.b64 %rd12864, %rd10303, %rd12864, %p2555; add.s64 %rd10304, %rd12865, 8; selp.b64 %rd12865, %rd10304, %rd12865, %p2555; selp.b64 %rd10305, %rd10302, %rd12866, %p2555; selp.b64 %rd10306, %rd10303, %rd12867, %p2555; selp.b64 %rd10307, %rd10304, %rd12868, %p2555; setp.eq.s64 %p2556, %rd12869, 0; add.s64 %rd10308, %rd10305, 4; add.s64 %rd10309, %rd10306, 4; add.s64 %rd10310, %rd10307, 4; selp.b64 %rd12866, %rd10305, %rd10308, %p2556; selp.b64 %rd12867, %rd10306, %rd10309, %p2556; 
selp.b64 %rd12868, %rd10307, %rd10310, %p2556; ld.local.f32 %f4748, [%rd10306]; setp.eq.f32 %p2557, %f4748, 0f00000000; @%p2557 bra $L__BB1_1870; add.f32 %f4749, %f1373, %f1379; mov.b32 %r4335, %f4749; add.f32 %f4750, %f1374, %f1380; mov.b32 %r4336, %f4750; cvt.u64.u32 %rd10313, %r4336; cvt.u64.u32 %rd10314, %r4335; bfi.b64 %rd12873, %rd10313, %rd10314, 32, 32; mov.u64 %rd12872, 0; bra.uni $L__BB1_1886; $L__BB1_1873: setp.lt.f32 %p2558, %f1375, %f1377; mov.f32 %f5472, 0fFF7FFFFF; @%p2558 bra $L__BB1_1876; bra.uni $L__BB1_1874; $L__BB1_1876: setp.leu.f32 %p2563, %f1377, 0fFF7FFFFF; mov.pred %p2956, 0; @%p2563 bra $L__BB1_1878; mov.f32 %f5472, %f1377; bra.uni $L__BB1_1878; $L__BB1_1874: setp.leu.f32 %p2560, %f1375, 0fFF7FFFFF; mov.pred %p2956, 0; @%p2560 bra $L__BB1_1878; mov.pred %p2956, -1; mov.f32 %f5472, %f1375; $L__BB1_1878: setp.lt.f32 %p2565, %f1376, %f1378; @%p2565 bra $L__BB1_1881; bra.uni $L__BB1_1879; $L__BB1_1881: setp.gt.f32 %p2567, %f1378, %f5472; @%p2567 bra $L__BB1_1884; bra.uni $L__BB1_1882; $L__BB1_1884: mov.u64 %rd10317, 0; st.local.u64 [%rd4015], %rd10317; neg.f32 %f5474, %f1378; mov.u64 %rd12871, %rd4036; bra.uni $L__BB1_1885; $L__BB1_1879: setp.leu.f32 %p2566, %f1376, %f5472; @%p2566 bra $L__BB1_1882; mov.u64 %rd10315, 0; st.local.u64 [%rd4015], %rd10315; mov.u64 %rd12871, %rd4036; mov.f32 %f5472, %f1376; bra.uni $L__BB1_1883; $L__BB1_1882: mov.u64 %rd10316, 0; st.local.u64 [%rd4015], %rd10316; neg.f32 %f5474, %f5472; not.pred %p2568, %p2956; mov.u64 %rd12871, %rd4015; @%p2568 bra $L__BB1_1885; $L__BB1_1883: mov.f32 %f5474, %f5472; $L__BB1_1885: st.local.f32 [%rd12871], %f5474; ld.local.u64 %rd10320, [%rd4015]; cvt.u32.u64 %r4337, %rd10320; mov.b32 %f4753, %r4337; shr.u64 %rd10321, %rd10320, 32; cvt.u32.u64 %r4338, %rd10321; mov.b32 %f4754, %r4338; add.f32 %f4755, %f1373, %f4753; add.f32 %f4756, %f1374, %f4754; mov.b32 %r4339, %f4755; mov.b32 %r4340, %f4756; cvt.u64.u32 %rd10322, %r4340; cvt.u64.u32 %rd10323, %r4339; bfi.b64 %rd12873, %rd10322, 
%rd10323, 32, 32; mov.u64 %rd12872, 1; $L__BB1_1886: mov.u64 %rd11276, 0; cvt.u32.u64 %r4341, %rd12873; mov.b32 %f4757, %r4341; shr.u64 %rd10324, %rd12873, 32; cvt.u32.u64 %r4342, %rd10324; mov.b32 %f4758, %r4342; mul.f32 %f4759, %f1372, %f4757; mul.f32 %f4760, %f1371, %f4758; sub.f32 %f4761, %f4759, %f4760; mul.f32 %f4762, %f1372, %f4758; fma.rn.f32 %f4763, %f1371, %f4757, %f4762; add.f32 %f4764, %f1369, %f4761; mov.b32 %r4343, %f4764; add.f32 %f4765, %f1370, %f4763; mov.b32 %r4344, %f4765; cvt.u64.u32 %rd10325, %r4344; cvt.u64.u32 %rd10326, %r4343; bfi.b64 %rd10327, %rd10325, %rd10326, 32, 32; or.b64 %rd10328, %rd11276, %rd10327; mov.b64 {%r5360, %r5361}, %rd10328; mov.b64 {%r5362, %r4345}, %rd12872; bra.uni $L__BB1_1887; $L__BB1_1851: sub.f32 %f4709, %f5469, %f1326; abs.f32 %f1362, %f4709; setp.le.f32 %p2520, %f1362, 0f34000000; @%p2520 bra $L__BB1_1853; abs.f32 %f4710, %f5469; abs.f32 %f4711, %f1326; setp.gt.f32 %p2522, %f4711, %f4710; selp.f32 %f4712, %f4711, %f4710, %p2522; mul.f32 %f4713, %f4712, 0f34000000; setp.gtu.f32 %p2523, %f1362, %f4713; @%p2523 bra $L__BB1_1857; bra.uni $L__BB1_1853; $L__BB1_1834: setp.gt.u64 %p2502, %rd4545, %rd4548; @%p2502 bra $L__BB1_1836; bra.uni $L__BB1_1835; $L__BB1_1836: add.s64 %rd10268, %rd4546, %rd4548; ld.u8 %rs876, [%rd10268]; setp.eq.s16 %p2503, %rs876, 0; @%p2503 bra $L__BB1_1861; cvt.rn.f32.u64 %f4698, %rd4548; fma.rn.f32 %f1346, %f1340, %f4698, 0fBF000000; setp.gt.u64 %p2504, %rd4542, %rd4548; @%p2504 bra $L__BB1_1839; bra.uni $L__BB1_1838; $L__BB1_1839: shl.b64 %rd10269, %rd4548, 2; add.s64 %rd4549, %rd4547, %rd10269; ld.f32 %f1347, [%rd4549]; add.s64 %rd10270, %rd4548, 1; setp.gt.u64 %p2505, %rd4542, %rd10270; @%p2505 bra $L__BB1_1841; bra.uni $L__BB1_1840; $L__BB1_1841: ld.f32 %f1348, [%rd4549+4]; setp.gt.f32 %p2506, %f1348, %f1342; setp.gt.f32 %p2507, %f1347, %f1342; and.pred %p2508, %p2507, %p2506; @%p2508 bra $L__BB1_1861; setp.lt.f32 %p2509, %f1347, %f1341; setp.lt.f32 %p2510, %f1348, %f1341; and.pred %p2511, 
%p2509, %p2510; @%p2511 bra $L__BB1_1861; mul.f32 %f4699, %f4679, %f1346; mov.b32 %r4297, %f4699; mul.f32 %f1351, %f4680, %f1347; mov.b32 %r4298, %f1351; cvt.u64.u32 %rd10271, %r4298; cvt.u64.u32 %rd10272, %r4297; add.f32 %f4700, %f1340, %f1346; mul.f32 %f1349, %f4679, %f4700; mov.b32 %r1412, %f1349; mul.f32 %f4701, %f4680, %f1348; mov.b32 %r4299, %f4701; cvt.u64.u32 %rd10273, %r4299; cvt.u64.u32 %rd10274, %r1412; bfi.b64 %rd10275, %rd10273, %rd10274, 32, 32; bfi.b64 %rd10276, %rd10271, %rd10272, 32, 32; cvt.u32.u64 %r5355, %rd10276; mov.b32 %f5469, %r5355; sub.f32 %f1352, %f1349, %f5469; sub.f32 %f1353, %f4701, %f1351; sub.f32 %f4702, %f1326, %f5469; sub.f32 %f4703, %f1327, %f1351; mul.f32 %f4704, %f1353, %f4703; fma.rn.f32 %f1354, %f1352, %f4702, %f4704; mul.f32 %f4705, %f1353, %f1353; fma.rn.f32 %f4706, %f1352, %f1352, %f4705; add.f32 %f1355, %f4706, 0f00000000; setp.gtu.f32 %p2512, %f1354, 0f00000000; mov.b64 {%r4300, %r5356}, %rd10276; mov.b64 {%r4301, %r1415}, %rd10275; @%p2512 bra $L__BB1_1845; bra.uni $L__BB1_1844; $L__BB1_1845: setp.ltu.f32 %p2513, %f1354, %f1355; @%p2513 bra $L__BB1_1847; bra.uni $L__BB1_1846; $L__BB1_1847: setp.eq.f32 %p2514, %f1355, 0f00000000; @%p2514 bra $L__BB1_1860; div.rn.f32 %f4707, %f1354, %f1355; fma.rn.f32 %f5469, %f1352, %f4707, %f5469; mov.b32 %r5355, %f5469; fma.rn.f32 %f5470, %f1353, %f4707, %f1351; mov.b32 %r5356, %f5470; bra.uni $L__BB1_1849; $L__BB1_1844: mov.b32 %f5470, %r5356; bra.uni $L__BB1_1849; $L__BB1_1846: mov.b32 %f5470, %r1415; mov.f32 %f5469, %f1349; mov.u32 %r5355, %r1412; mov.u32 %r5356, %r1415; $L__BB1_1849: setp.eq.f32 %p2515, %f1326, %f5469; @%p2515 bra $L__BB1_1853; bra.uni $L__BB1_1850; $L__BB1_1853: setp.eq.f32 %p2525, %f5470, %f1327; mov.pred %p2524, -1; mov.pred %p2954, %p2524; @%p2525 bra $L__BB1_1857; setp.eq.f32 %p2527, %f1344, 0f7F800000; and.b32 %r4303, %r5356, 2147483647; mov.b32 %f4714, %r4303; setp.eq.f32 %p2528, %f4714, 0f7F800000; or.pred %p2529, %p2527, %p2528; mov.pred %p2954, 0; @%p2529 
bra $L__BB1_1857; sub.f32 %f4715, %f5470, %f1327; abs.f32 %f1363, %f4715; setp.le.f32 %p2531, %f1363, 0f34000000; mov.pred %p2954, %p2524; @%p2531 bra $L__BB1_1857; abs.f32 %f4716, %f5470; abs.f32 %f4717, %f1327; setp.gt.f32 %p2532, %f4717, %f4716; selp.f32 %f4718, %f4717, %f4716, %p2532; mul.f32 %f4719, %f4718, 0f34000000; setp.le.f32 %p2954, %f1363, %f4719; bra.uni $L__BB1_1857; $L__BB1_1850: setp.eq.f32 %p2517, %f1343, 0f7F800000; and.b32 %r4302, %r5355, 2147483647; mov.b32 %f4708, %r4302; setp.eq.f32 %p2518, %f4708, 0f7F800000; or.pred %p2519, %p2517, %p2518; mov.pred %p2954, 0; @%p2519 bra $L__BB1_1857; bra.uni $L__BB1_1851; $L__BB1_1857: cvt.u64.u32 %rd10277, %r5356; cvt.u64.u32 %rd10278, %r5355; bfi.b64 %rd4550, %rd10277, %rd10278, 32, 32; mov.b64 {%r4304, %r4305}, %rd4550; selp.u64 %rd4551, 1, 0, %p2954; mov.b32 %f4720, %r4304; sub.f32 %f4721, %f4720, %f1326; mov.b32 %f4722, %r4305; sub.f32 %f4723, %f4722, %f1327; mul.f32 %f4724, %f4723, %f4723; fma.rn.f32 %f4725, %f4721, %f4721, %f4724; add.f32 %f1364, %f4725, 0f00000000; setp.geu.f32 %p2533, %f1364, %f5471; @%p2533 bra $L__BB1_1861; sqrt.rn.f32 %f4726, %f1364; setp.gtu.f32 %p2534, %f4726, %f6; mov.f32 %f5471, %f1364; @%p2534 bra $L__BB1_1861; mov.b64 {%r5359, %r4306}, %rd4551; mov.u32 %r5357, %r4304; mov.u32 %r5358, %r4305; mov.f32 %f5471, %f1364; $L__BB1_1861: add.s64 %rd4548, %rd4548, 1; setp.lt.u64 %p2535, %rd4548, %rd4544; @%p2535 bra $L__BB1_1834; st.local.u32 [%rd4541+8], %r5359; mov.b64 %rd10279, {%r5357, %r5358}; st.local.u64 [%rd4541], %rd10279; $L__BB1_1863: cvt.u64.u32 %rd10280, %r1404; cvt.u64.u32 %rd10281, %r1405; bfi.b64 %rd4553, %rd10281, %rd10280, 32, 32; ld.local.v4.u32 {%r4310, %r4311, %r4312, %r4313}, [%rd4541]; mov.b64 %rd4555, {%r4312, %r4313}; mov.b64 %rd4554, {%r4310, %r4311}; mov.b32 {%rs877, %rs878}, %r4312; and.b16 %rs879, %rs877, 255; setp.eq.s16 %p2536, %rs879, 2; cvt.u64.u16 %rd10282, %rs877; and.b64 %rd10283, %rd10282, 255; selp.b64 %rd10284, 2, %rd10283, %p2536; and.b64 
%rd10285, %rd4555, 4294967040; or.b64 %rd10286, %rd10285, %rd10284; mov.b64 {%r4318, %r4319}, %rd10286; mov.b32 {%rs1039, %rs880}, %r4318; and.b16 %rs881, %rs1039, 255; setp.eq.s16 %p2537, %rs881, 2; mov.u32 %r5362, 2; mov.u32 %r5360, 0; mov.u32 %r5361, %r5360; @%p2537 bra $L__BB1_1887; ld.global.u8 %rs882, [%rd4050+-208]; setp.eq.s16 %p2538, %rs882, 0; shr.u64 %rd10287, %rd4554, 32; cvt.u32.u64 %r4320, %rd10287; mov.b32 %f1366, %r4320; @%p2538 bra $L__BB1_1868; mov.b64 {%r4321, %r4322}, %rd4553; mov.b32 %f1368, %r4322; mov.b32 %f1367, %r4321; ld.global.u8 %rs181, [%rd4050+-207]; setp.gt.f32 %p2540, %f1367, %f4671; setp.lt.f32 %p2541, %f1367, %f4673; or.pred %p2542, %p2541, %p2540; mov.pred %p2955, 0; @%p2542 bra $L__BB1_1867; setp.geu.f32 %p2543, %f1368, 0fFF7FFFFF; setp.leu.f32 %p2544, %f1368, 0f7F7FFFFF; and.pred %p2955, %p2544, %p2543; $L__BB1_1867: setp.ge.f32 %p2545, %f1327, %f1366; setp.le.f32 %p2546, %f1327, %f1366; setp.eq.s16 %p2547, %rs181, 0; selp.u32 %r4323, -1, 0, %p2545; selp.u32 %r4324, -1, 0, %p2546; selp.b32 %r4325, %r4324, %r4323, %p2547; and.b32 %r4326, %r4325, 1; setp.eq.b32 %p2548, %r4326, 1; and.pred %p2549, %p2548, %p2955; selp.u16 %rs1039, 1, 0, %p2549; $L__BB1_1868: cvt.u32.u64 %r4327, %rd4554; mov.b32 %f4727, %r4327; mul.f32 %f4728, %f1325, %f4727; mul.f32 %f4729, %f1324, %f1366; sub.f32 %f4730, %f4728, %f4729; mul.f32 %f4731, %f1325, %f1366; fma.rn.f32 %f4732, %f1324, %f4727, %f4731; add.f32 %f4733, %f1322, %f4730; mov.b32 %r4328, %f4733; add.f32 %f4734, %f1323, %f4732; mov.b32 %r4329, %f4734; cvt.u64.u32 %rd10288, %r4329; cvt.u64.u32 %rd10289, %r4328; cvt.u64.u16 %rd10290, %rs1039; bfi.b64 %rd10291, %rd10288, %rd10289, 32, 32; and.b64 %rd10292, %rd10290, 255; mov.b64 {%r5360, %r5361}, %rd10291; mov.b64 {%r5362, %r4330}, %rd10292; bra.uni $L__BB1_1887; $L__BB1_1686: setp.eq.s32 %p2306, %r5297, 0; @%p2306 bra $L__BB1_1699; setp.ne.s32 %p2307, %r5297, 1; @%p2307 bra $L__BB1_1712; add.s64 %rd4077, %rd12694, 1; or.b64 %rd9788, %rd4077, 
%rd4052; and.b64 %rd9789, %rd9788, -4294967296; setp.eq.s64 %p2308, %rd9789, 0; @%p2308 bra $L__BB1_1690; rem.u64 %rd12700, %rd4077, %rd4052; bra.uni $L__BB1_1691; $L__BB1_1699: setp.eq.s64 %p2315, %rd12694, 0; selp.b64 %rd4124, %rd4052, %rd12694, %p2315; add.s64 %rd9828, %rd4124, -1; setp.gt.u64 %p2316, %rd4052, %rd9828; @%p2316 bra $L__BB1_1701; bra.uni $L__BB1_1700; $L__BB1_1701: shl.b64 %rd9829, %rd4124, 3; add.s64 %rd9830, %rd4053, %rd9829; ld.u32 %rd9831, [%rd9830+-8]; ld.u32 %rd9832, [%rd9830+-4]; bfi.b64 %rd4125, %rd9832, %rd9831, 32, 32; or.b64 %rd9833, %rd4124, %rd4052; and.b64 %rd9834, %rd9833, -4294967296; setp.eq.s64 %p2317, %rd9834, 0; @%p2317 bra $L__BB1_1703; rem.u64 %rd12717, %rd4124, %rd4052; bra.uni $L__BB1_1704; $L__BB1_1817: ld.u32 %r4260, [%rd4190+76]; cvt.u64.u32 %rd10198, %r4260; setp.le.u64 %p2482, %rd4181, %rd10198; mul.wide.u32 %rd10199, %r4260, 12; add.s64 %rd10200, %rd4182, %rd10199; setp.eq.s64 %p2483, %rd10200, 0; or.pred %p2484, %p2482, %p2483; selp.b32 %r1298, %r1298, %r5316, %p2484; selp.b32 %r1297, %r1297, %r5315, %p2484; selp.b32 %r1296, %r1296, %r5314, %p2484; selp.b32 %r1300, %r1300, %r5329, %p2484; selp.b32 %r1301, %r1301, %r1350, %p2484; $L__BB1_1722: mov.b32 %f1257, %r1301; $L__BB1_1723: mov.u32 %r1302, %r1303; setp.eq.s32 %p2326, %r1302, 0; @%p2326 bra $L__BB1_1824; cvt.u64.u32 %rd9890, %r1302; add.s64 %rd9891, %rd9890, -1; cvt.u32.u64 %r1303, %rd9891; st.local.u32 [%rd4015+512], %r1303; mul.wide.u32 %rd9892, %r1302, 8; add.s64 %rd9893, %rd4015, %rd9892; ld.local.u32 %rd4188, [%rd9893+-4]; ld.local.u32 %rd9894, [%rd9893+-8]; shl.b64 %rd9895, %rd9894, 32; or.b64 %rd4187, %rd9895, 1; mov.b64 {%r4116, %r4117}, %rd4188; mov.b32 %f4475, %r4116; neg.f32 %f4476, %f4475; setp.le.f32 %p2327, %f1257, %f4476; @%p2327 bra $L__BB1_1723; mov.b64 {%r4118, %r4119}, %rd4187; cvt.u64.u32 %rd4189, %r4119; setp.gt.u64 %p2328, %rd4178, %rd4189; @%p2328 bra $L__BB1_1727; bra.uni $L__BB1_1726; $L__BB1_1727: mul.lo.s64 %rd9896, %rd4189, 96; 
add.s64 %rd4190, %rd4180, %rd9896; ld.u8 %rs817, [%rd4190+88]; and.b16 %rs818, %rs817, 1; setp.eq.b16 %p2330, %rs818, 1; mov.pred %p2953, 0; xor.pred %p2331, %p2330, %p2953; not.pred %p2332, %p2331; @%p2332 bra $L__BB1_1729; ld.v4.u32 {%r4120, %r4121, %r4122, %r4123}, [%rd4190+64]; cvt.u64.u32 %rd9897, %r4120; setp.gt.u64 %p2334, %rd4181, %rd9897; mul.wide.u32 %rd9898, %r4120, 12; add.s64 %rd9899, %rd4182, %rd9898; selp.b64 %rd9900, %rd9899, 0, %p2334; setp.eq.s64 %p2335, %rd9900, 0; add.s64 %rd9901, %rd9900, 8; selp.b64 %rd12738, 0, %rd9901, %p2335; cvt.u64.u32 %rd9902, %r4121; setp.gt.u64 %p2336, %rd4181, %rd9902; mul.wide.u32 %rd9903, %r4121, 12; add.s64 %rd9904, %rd4182, %rd9903; selp.b64 %rd9905, %rd9904, 0, %p2336; setp.eq.s64 %p2337, %rd9905, 0; add.s64 %rd9906, %rd9905, 8; selp.b64 %rd12737, 0, %rd9906, %p2337; ld.u32 %r4127, [%rd4190+72]; cvt.u64.u32 %rd9907, %r4127; setp.gt.u64 %p2338, %rd4181, %rd9907; mul.wide.u32 %rd9908, %r4127, 12; add.s64 %rd9909, %rd4182, %rd9908; selp.b64 %rd9910, %rd9909, 0, %p2338; setp.eq.s64 %p2339, %rd9910, 0; add.s64 %rd9911, %rd9910, 8; selp.b64 %rd12736, 0, %rd9911, %p2339; cvt.u64.u32 %rd9912, %r4123; setp.gt.u64 %p2340, %rd4181, %rd9912; mul.wide.u32 %rd9913, %r4123, 12; add.s64 %rd9914, %rd4182, %rd9913; selp.b64 %rd9915, %rd9914, 0, %p2340; setp.eq.s64 %p2341, %rd9915, 0; add.s64 %rd9916, %rd9915, 8; selp.b64 %rd12735, 0, %rd9916, %p2341; mov.pred %p2953, -1; $L__BB1_1729: ld.v4.f32 {%f4477, %f4478, %f4479, %f4480}, [%rd4190]; sub.f32 %f4485, %f4477, %f1255; sub.f32 %f4486, %f4478, %f1255; sub.f32 %f4487, %f4479, %f1255; sub.f32 %f4488, %f4480, %f1255; ld.v4.f32 {%f4489, %f4490, %f4491, %f4492}, [%rd4190+16]; sub.f32 %f4497, %f4489, %f1256; sub.f32 %f4498, %f4490, %f1256; sub.f32 %f4499, %f4491, %f1256; sub.f32 %f4500, %f4492, %f1256; ld.v4.f32 {%f4501, %f4502, %f4503, %f4504}, [%rd4190+32]; sub.f32 %f4509, %f1255, %f4501; sub.f32 %f4510, %f1255, %f4502; sub.f32 %f4511, %f1255, %f4503; sub.f32 %f4512, %f1255, %f4504; 
ld.v4.f32 {%f4513, %f4514, %f4515, %f4516}, [%rd4190+48]; sub.f32 %f4521, %f1256, %f4513; sub.f32 %f4522, %f1256, %f4514; sub.f32 %f4523, %f1256, %f4515; sub.f32 %f4524, %f1256, %f4516; setp.ge.f32 %p2342, %f4485, %f4509; selp.f32 %f4525, %f4485, %f4509, %p2342; setp.ge.f32 %p2343, %f4486, %f4510; selp.f32 %f4526, %f4486, %f4510, %p2343; setp.ge.f32 %p2344, %f4487, %f4511; selp.f32 %f4527, %f4487, %f4511, %p2344; setp.ge.f32 %p2345, %f4488, %f4512; selp.f32 %f4528, %f4488, %f4512, %p2345; setp.ge.f32 %p2346, %f4497, %f4521; selp.f32 %f4529, %f4497, %f4521, %p2346; setp.ge.f32 %p2347, %f4498, %f4522; selp.f32 %f4530, %f4498, %f4522, %p2347; setp.ge.f32 %p2348, %f4499, %f4523; selp.f32 %f4531, %f4499, %f4523, %p2348; setp.ge.f32 %p2349, %f4500, %f4524; selp.f32 %f4532, %f4500, %f4524, %p2349; setp.ge.f32 %p2350, %f4525, 0f00000000; selp.f32 %f4533, %f4525, 0f00000000, %p2350; setp.ge.f32 %p2351, %f4526, 0f00000000; selp.f32 %f4534, %f4526, 0f00000000, %p2351; setp.ge.f32 %p2352, %f4527, 0f00000000; selp.f32 %f4535, %f4527, 0f00000000, %p2352; setp.ge.f32 %p2353, %f4528, 0f00000000; selp.f32 %f4536, %f4528, 0f00000000, %p2353; mov.b32 %r4128, %f4533; mov.b32 %r4129, %f4534; mov.b32 %r4130, %f4535; mov.b32 %r4131, %f4536; cvt.u64.u32 %rd9917, %r4131; cvt.u64.u32 %rd9918, %r4129; cvt.u64.u32 %rd9919, %r4128; cvt.u64.u32 %rd9920, %r4130; bfi.b64 %rd9921, %rd9917, %rd9920, 32, 32; bfi.b64 %rd9922, %rd9918, %rd9919, 32, 32; setp.ge.f32 %p2354, %f4529, 0f00000000; selp.f32 %f4537, %f4529, 0f00000000, %p2354; setp.ge.f32 %p2355, %f4530, 0f00000000; selp.f32 %f4538, %f4530, 0f00000000, %p2355; setp.ge.f32 %p2356, %f4531, 0f00000000; selp.f32 %f4539, %f4531, 0f00000000, %p2356; setp.ge.f32 %p2357, %f4532, 0f00000000; selp.f32 %f4540, %f4532, 0f00000000, %p2357; mov.b32 %r4132, %f4537; mov.b32 %r4133, %f4538; mov.b32 %r4134, %f4539; mov.b32 %r4135, %f4540; cvt.u64.u32 %rd9923, %r4135; cvt.u64.u32 %rd9924, %r4133; cvt.u64.u32 %rd9925, %r4132; cvt.u64.u32 %rd9926, %r4134; bfi.b64 
%rd9927, %rd9923, %rd9926, 32, 32; bfi.b64 %rd9928, %rd9924, %rd9925, 32, 32; mov.b64 {%r4136, %r4137}, %rd9922; mov.b64 {%r4138, %r4139}, %rd9921; cvt.u64.u32 %rd9929, %r4139; cvt.u64.u32 %rd9930, %r4137; cvt.u64.u32 %rd9931, %r4138; bfi.b64 %rd9932, %rd9929, %rd9931, 32, 32; mov.b64 {%r4140, %r4141}, %rd9932; bfi.b64 %rd9933, %rd9930, %rd9919, 32, 32; mov.b64 {%r4142, %r4143}, %rd9933; mov.b32 %f4541, %r4142; mov.b32 %f4542, %r4143; mov.b32 %f4543, %r4140; mov.b32 %f4544, %r4141; mov.b32 %f4545, %r4136; mov.b32 %f4546, %r4137; mov.b32 %f4547, %r4138; mov.b32 %f4548, %r4139; mov.b64 {%r4144, %r4145}, %rd9928; mov.b64 {%r4146, %r4147}, %rd9927; cvt.u64.u32 %rd9934, %r4147; cvt.u64.u32 %rd9935, %r4145; cvt.u64.u32 %rd9936, %r4146; bfi.b64 %rd9937, %rd9934, %rd9936, 32, 32; mov.b64 {%r4148, %r4149}, %rd9937; bfi.b64 %rd9938, %rd9935, %rd9925, 32, 32; mov.b64 {%r4150, %r4151}, %rd9938; mov.b32 %f4549, %r4150; mov.b32 %f4550, %r4151; mov.b32 %f4551, %r4148; mov.b32 %f4552, %r4149; mov.b32 %f4553, %r4144; mov.b32 %f4554, %r4145; mov.b32 %f4555, %r4146; mov.b32 %f4556, %r4147; mul.f32 %f4557, %f4553, %f4549; mul.f32 %f4558, %f4554, %f4550; mul.f32 %f4559, %f4555, %f4551; mul.f32 %f4560, %f4556, %f4552; fma.rn.f32 %f4561, %f4545, %f4541, %f4557; fma.rn.f32 %f4562, %f4546, %f4542, %f4558; fma.rn.f32 %f4563, %f4547, %f4543, %f4559; fma.rn.f32 %f4564, %f4548, %f4544, %f4560; add.f32 %f4565, %f4561, 0f00000000; add.f32 %f4566, %f4562, 0f00000000; add.f32 %f4567, %f4563, 0f00000000; add.f32 %f4568, %f4564, 0f00000000; sqrt.rn.f32 %f4569, %f4565; sqrt.rn.f32 %f4570, %f4566; sqrt.rn.f32 %f4571, %f4567; sqrt.rn.f32 %f4572, %f4568; mov.b32 %r4152, %f4569; mov.b32 %r4153, %f4570; mov.b32 %r4154, %f4571; mov.b32 %r4155, %f4572; cvt.u64.u32 %rd9939, %r4155; cvt.u64.u32 %rd9940, %r4153; cvt.u64.u32 %rd9941, %r4152; cvt.u64.u32 %rd9942, %r4154; bfi.b64 %rd12844, %rd9939, %rd9942, 32, 32; mov.b64 {%r4156, %r4157}, %rd12844; bfi.b64 %rd12843, %rd9940, %rd9941, 32, 32; mov.b64 {%r4158, 
%r4159}, %rd12843; mov.b32 %f4573, %r4158; mov.b32 %f4574, %r4159; mov.b32 %f4575, %r4156; mov.b32 %f4576, %r4157; setp.lt.f32 %p2358, %f4573, %f1257; setp.lt.f32 %p2359, %f4574, %f1257; setp.lt.f32 %p2360, %f4575, %f1257; setp.lt.f32 %p2361, %f4576, %f1257; selp.u32 %r4160, 1, 0, %p2358; selp.u32 %r4161, -1, 0, %p2359; bfi.b32 %r4162, %r4161, %r4160, 8, 1; selp.u32 %r4163, -1, 0, %p2360; bfi.b32 %r4164, %r4163, %r4162, 16, 1; selp.u32 %r4165, -1, 0, %p2361; bfi.b32 %r4166, %r4165, %r4164, 24, 1; cvt.u64.u32 %rd9943, %r4166; mov.b64 {%r4167, %r4168}, %rd9943; mov.b32 {%rs819, %rs820}, %r4167; and.b16 %rs821, %rs819, 1; shr.u16 %rs822, %rs819, 7; and.b16 %rs823, %rs822, 2; or.b16 %rs824, %rs823, %rs821; shl.b16 %rs825, %rs820, 2; and.b16 %rs826, %rs825, 4; or.b16 %rs827, %rs824, %rs826; shr.u16 %rs828, %rs820, 5; and.b16 %rs829, %rs828, 8; or.b16 %rs830, %rs827, %rs829; cvt.u64.u16 %rd4201, %rs830; @%p2953 bra $L__BB1_1731; bra.uni $L__BB1_1730; $L__BB1_1731: mov.u64 %rd9944, 1; st.local.v2.u64 [%rd8], {%rd12738, %rd12737}; st.local.v2.u64 [%rd8+16], {%rd12736, %rd12735}; mov.f32 %f4577, 0f00000000; st.local.v4.f32 [%rd24], {%f4577, %f4577, %f4577, %f4577}; mov.u32 %r4179, 4; st.local.u32 [%rd4011+16], %r4179; st.local.u32 [%rd4011+52], %r4179; st.local.u32 [%rd4011+88], %r4179; st.local.u32 [%rd4011+124], %r4179; mov.u64 %rd4206, %rd9944; $L__BB1_1732: add.s64 %rd9945, %rd4206, -1; cvt.u32.u64 %r4180, %rd9945; shl.b64 %rd9947, %rd9944, %r4180; and.b64 %rd9948, %rd9947, %rd4201; setp.eq.s64 %p2362, %rd9948, 0; @%p2362 bra $L__BB1_1785; shl.b64 %rd9949, %rd4206, 3; add.s64 %rd9950, %rd8, %rd9949; ld.local.u64 %rd4207, [%rd9950+-8]; setp.eq.s64 %p2363, %rd4207, 0; @%p2363 bra $L__BB1_1785; ld.u32 %r1304, [%rd4207]; cvt.u64.u32 %rd4208, %r1304; ld.global.u64 %rd9951, [%rd4050+-160]; setp.gt.u64 %p2364, %rd9951, %rd4208; @%p2364 bra $L__BB1_1736; bra.uni $L__BB1_1735; $L__BB1_1736: ld.global.u64 %rd9952, [%rd4050+-168]; mul.lo.s64 %rd9953, %rd4208, 12; add.s64 %rd4209, 
%rd9952, %rd9953; ld.u32 %rd4210, [%rd4209+8]; ld.u32 %rd4211, [%rd4209]; ld.global.u64 %rd4212, [%rd4050+-176]; setp.gt.u64 %p2365, %rd4212, %rd4211; @%p2365 bra $L__BB1_1738; bra.uni $L__BB1_1737; $L__BB1_1738: ld.global.u64 %rd4213, [%rd4050+-184]; shl.b64 %rd9954, %rd4211, 3; add.s64 %rd9955, %rd4213, %rd9954; ld.u32 %rd9956, [%rd9955]; ld.u32 %rd9957, [%rd9955+4]; bfi.b64 %rd4214, %rd9957, %rd9956, 32, 32; ld.u32 %rd4215, [%rd4209+4]; setp.gt.u64 %p2366, %rd4212, %rd4215; @%p2366 bra $L__BB1_1740; bra.uni $L__BB1_1739; $L__BB1_1740: setp.gt.u64 %p2367, %rd4212, %rd4210; @%p2367 bra $L__BB1_1742; bra.uni $L__BB1_1741; $L__BB1_1742: shl.b64 %rd9958, %rd4215, 3; add.s64 %rd9959, %rd4213, %rd9958; shl.b64 %rd9960, %rd4210, 3; add.s64 %rd9961, %rd4213, %rd9960; cvt.u32.u64 %r4181, %rd4214; mov.b32 %f1258, %r4181; shr.u64 %rd9962, %rd4214, 32; cvt.u32.u64 %r4182, %rd9962; mov.b32 %f1259, %r4182; ld.u32 %rd9963, [%rd9959]; ld.u32 %rd9964, [%rd9959+4]; bfi.b64 %rd4216, %rd9964, %rd9963, 32, 32; cvt.u32.u64 %r4183, %rd4216; shr.u64 %rd9965, %rd4216, 32; cvt.u32.u64 %r4184, %rd9965; mov.b32 %f1260, %r4183; sub.f32 %f1261, %f1260, %f1258; mov.b32 %f5463, %r4184; sub.f32 %f1263, %f5463, %f1259; ld.u32 %rd9966, [%rd9961]; ld.u32 %rd9967, [%rd9961+4]; bfi.b64 %rd4217, %rd9967, %rd9966, 32, 32; cvt.u32.u64 %r4185, %rd4217; shr.u64 %rd9968, %rd4217, 32; cvt.u32.u64 %r4186, %rd9968; mov.b32 %f1264, %r4185; sub.f32 %f1265, %f1264, %f1258; mov.b32 %f1266, %r4186; sub.f32 %f1267, %f1266, %f1259; sub.f32 %f1268, %f1255, %f1258; sub.f32 %f1269, %f1256, %f1259; mul.f32 %f4578, %f1269, %f1263; fma.rn.f32 %f1270, %f1268, %f1261, %f4578; mul.f32 %f4579, %f1269, %f1267; fma.rn.f32 %f1271, %f1268, %f1265, %f4579; setp.le.f32 %p2368, %f1270, 0f00000000; setp.le.f32 %p2369, %f1271, 0f00000000; and.pred %p2370, %p2368, %p2369; @%p2370 bra $L__BB1_1780; bra.uni $L__BB1_1743; $L__BB1_1780: add.u64 %rd12829, %SP, 552; add.u64 %rd12835, %SP, 0; st.local.u64 [%rd4019], %rd4214; mov.u64 %rd12840, 
2; mov.u64 %rd12826, %rd4032; mov.u64 %rd12827, %rd4031; mov.u64 %rd12828, %rd4031; mov.u64 %rd12830, %rd4031; mov.u64 %rd12831, %rd4031; mov.u64 %rd12832, %rd12829; mov.u64 %rd12833, %rd4019; mov.u64 %rd12834, %rd4019; mov.u64 %rd12836, %rd4019; mov.u64 %rd12837, %rd4019; mov.u64 %rd12838, %rd12835; mov.u64 %rd12839, %rd4020; $L__BB1_1781: setp.eq.s64 %p2423, %rd12840, 0; mov.u64 %rd12841, 1; @%p2423 bra $L__BB1_1783; add.s64 %rd12840, %rd12840, -1; add.s64 %rd10113, %rd12827, 8; setp.eq.s64 %p2424, %rd12830, %rd12826; selp.b64 %rd10114, %rd10113, %rd12830, %p2424; add.s64 %rd10115, %rd12828, 8; selp.b64 %rd10116, %rd10115, %rd12831, %p2424; add.s64 %rd10117, %rd12829, 8; selp.b64 %rd10118, %rd10117, %rd12832, %p2424; mov.u64 %rd12841, 0; setp.eq.s64 %p2425, %rd12840, 0; add.s64 %rd10119, %rd10114, 4; add.s64 %rd10120, %rd10116, 4; add.s64 %rd10121, %rd10118, 4; selp.b64 %rd4443, %rd10114, %rd10119, %p2425; selp.b64 %rd12831, %rd10116, %rd10120, %p2425; selp.b64 %rd12832, %rd10118, %rd10121, %p2425; selp.b64 %rd12827, %rd10113, %rd12827, %p2424; selp.b64 %rd12828, %rd10115, %rd12828, %p2424; selp.b64 %rd12829, %rd10117, %rd12829, %p2424; add.s64 %rd10122, %rd12830, 8; selp.b64 %rd12826, %rd10122, %rd12826, %p2424; add.s64 %rd10123, %rd12836, 8; setp.eq.s64 %p2426, %rd12833, %rd12839; selp.b64 %rd10124, %rd10123, %rd12833, %p2426; add.s64 %rd10125, %rd12837, 8; selp.b64 %rd10126, %rd10125, %rd12834, %p2426; add.s64 %rd10127, %rd12838, 8; selp.b64 %rd10128, %rd10127, %rd12835, %p2426; selp.b64 %rd12836, %rd10123, %rd12836, %p2426; selp.b64 %rd12837, %rd10125, %rd12837, %p2426; selp.b64 %rd12838, %rd10127, %rd12838, %p2426; add.s64 %rd10129, %rd12833, 8; selp.b64 %rd12839, %rd10129, %rd12839, %p2426; add.s64 %rd10130, %rd10124, 4; add.s64 %rd10131, %rd10126, 4; add.s64 %rd10132, %rd10128, 4; selp.b64 %rd12833, %rd10124, %rd10130, %p2425; selp.b64 %rd12834, %rd10126, %rd10131, %p2425; selp.b64 %rd12835, %rd10128, %rd10132, %p2425; ld.local.f32 %f4645, [%rd10126]; 
ld.local.f32 %f4646, [%rd10116]; setp.eq.f32 %p2427, %f4646, %f4645; mov.u64 %rd12830, %rd4443; @%p2427 bra $L__BB1_1781; $L__BB1_1783: mov.u64 %rd11253, 0; or.b64 %rd10134, %rd11253, %rd4214; mov.b64 {%r4228, %r4229}, %rd10134; mov.b64 {%r4230, %r4231}, %rd12841; cvt.u32.u64 %r4233, %rd11253; or.b32 %r5311, %r4233, %r4181; mov.u32 %r5312, 0; mov.b32 %f5467, %r4229; mov.b32 {%rs1038, %rs849}, %r4230; mov.u32 %r5313, %r5312; bra.uni $L__BB1_1784; $L__BB1_1743: sub.f32 %f1272, %f1255, %f1260; sub.f32 %f1273, %f1256, %f5463; mul.f32 %f4580, %f1263, %f1273; fma.rn.f32 %f1274, %f1261, %f1272, %f4580; mul.f32 %f4581, %f1273, %f1267; fma.rn.f32 %f1275, %f1272, %f1265, %f4581; setp.ge.f32 %p2371, %f1274, 0f00000000; setp.le.f32 %p2372, %f1275, %f1274; and.pred %p2373, %p2371, %p2372; @%p2373 bra $L__BB1_1776; bra.uni $L__BB1_1744; $L__BB1_1776: add.u64 %rd12813, %SP, 552; add.u64 %rd12819, %SP, 0; st.local.u64 [%rd4019], %rd4216; mov.u64 %rd12824, 2; mov.u64 %rd12810, %rd4032; mov.u64 %rd12811, %rd4031; mov.u64 %rd12812, %rd4031; mov.u64 %rd12814, %rd4031; mov.u64 %rd12815, %rd4031; mov.u64 %rd12816, %rd12813; mov.u64 %rd12817, %rd4019; mov.u64 %rd12818, %rd4019; mov.u64 %rd12820, %rd4019; mov.u64 %rd12821, %rd4019; mov.u64 %rd12822, %rd12819; mov.u64 %rd12823, %rd4022; $L__BB1_1777: setp.eq.s64 %p2418, %rd12824, 0; mov.u64 %rd12825, 1; @%p2418 bra $L__BB1_1779; add.s64 %rd12824, %rd12824, -1; add.s64 %rd10086, %rd12811, 8; setp.eq.s64 %p2419, %rd12814, %rd12810; selp.b64 %rd10087, %rd10086, %rd12814, %p2419; add.s64 %rd10088, %rd12812, 8; selp.b64 %rd10089, %rd10088, %rd12815, %p2419; add.s64 %rd10090, %rd12813, 8; selp.b64 %rd10091, %rd10090, %rd12816, %p2419; mov.u64 %rd12825, 0; setp.eq.s64 %p2420, %rd12824, 0; add.s64 %rd10092, %rd10087, 4; add.s64 %rd10093, %rd10089, 4; add.s64 %rd10094, %rd10091, 4; selp.b64 %rd4405, %rd10087, %rd10092, %p2420; selp.b64 %rd12815, %rd10089, %rd10093, %p2420; selp.b64 %rd12816, %rd10091, %rd10094, %p2420; selp.b64 %rd12811, %rd10086, 
%rd12811, %p2419; selp.b64 %rd12812, %rd10088, %rd12812, %p2419; selp.b64 %rd12813, %rd10090, %rd12813, %p2419; add.s64 %rd10095, %rd12814, 8; selp.b64 %rd12810, %rd10095, %rd12810, %p2419; add.s64 %rd10096, %rd12820, 8; setp.eq.s64 %p2421, %rd12817, %rd12823; selp.b64 %rd10097, %rd10096, %rd12817, %p2421; add.s64 %rd10098, %rd12821, 8; selp.b64 %rd10099, %rd10098, %rd12818, %p2421; add.s64 %rd10100, %rd12822, 8; selp.b64 %rd10101, %rd10100, %rd12819, %p2421; selp.b64 %rd12820, %rd10096, %rd12820, %p2421; selp.b64 %rd12821, %rd10098, %rd12821, %p2421; selp.b64 %rd12822, %rd10100, %rd12822, %p2421; add.s64 %rd10102, %rd12817, 8; selp.b64 %rd12823, %rd10102, %rd12823, %p2421; add.s64 %rd10103, %rd10097, 4; add.s64 %rd10104, %rd10099, 4; add.s64 %rd10105, %rd10101, 4; selp.b64 %rd12817, %rd10097, %rd10103, %p2420; selp.b64 %rd12818, %rd10099, %rd10104, %p2420; selp.b64 %rd12819, %rd10101, %rd10105, %p2420; ld.local.f32 %f4643, [%rd10099]; ld.local.f32 %f4644, [%rd10089]; setp.eq.f32 %p2422, %f4644, %f4643; mov.u64 %rd12814, %rd4405; @%p2422 bra $L__BB1_1777; $L__BB1_1779: mov.u64 %rd11252, 0; or.b64 %rd10107, %rd11252, %rd4216; mov.b64 {%r4220, %r4221}, %rd10107; mov.b64 {%r4222, %r4223}, %rd12825; cvt.u32.u64 %r4225, %rd11252; or.b32 %r5311, %r4225, %r4183; mov.u32 %r5312, 0; mov.b32 %f5467, %r4221; mov.u32 %r5313, 1; mov.b32 {%rs1038, %rs845}, %r4222; bra.uni $L__BB1_1784; $L__BB1_1744: sub.f32 %f1276, %f1255, %f1264; sub.f32 %f1277, %f1256, %f1266; mul.f32 %f4582, %f1263, %f1277; fma.rn.f32 %f1278, %f1261, %f1276, %f4582; mul.f32 %f4583, %f1267, %f1277; fma.rn.f32 %f1279, %f1265, %f1276, %f4583; setp.ge.f32 %p2374, %f1279, 0f00000000; setp.le.f32 %p2375, %f1278, %f1279; and.pred %p2376, %p2375, %p2374; @%p2376 bra $L__BB1_1772; bra.uni $L__BB1_1745; $L__BB1_1772: add.u64 %rd12797, %SP, 552; add.u64 %rd12803, %SP, 0; st.local.u64 [%rd4019], %rd4217; mov.u64 %rd12808, 2; mov.u64 %rd12794, %rd4032; mov.u64 %rd12795, %rd4031; mov.u64 %rd12796, %rd4031; mov.u64 
%rd12798, %rd4031; mov.u64 %rd12799, %rd4031; mov.u64 %rd12800, %rd12797; mov.u64 %rd12801, %rd4019; mov.u64 %rd12802, %rd4019; mov.u64 %rd12804, %rd4019; mov.u64 %rd12805, %rd4019; mov.u64 %rd12806, %rd12803; mov.u64 %rd12807, %rd4024; $L__BB1_1773: setp.eq.s64 %p2413, %rd12808, 0; mov.u64 %rd12809, 1; @%p2413 bra $L__BB1_1775; add.s64 %rd12808, %rd12808, -1; add.s64 %rd10059, %rd12795, 8; setp.eq.s64 %p2414, %rd12798, %rd12794; selp.b64 %rd10060, %rd10059, %rd12798, %p2414; add.s64 %rd10061, %rd12796, 8; selp.b64 %rd10062, %rd10061, %rd12799, %p2414; add.s64 %rd10063, %rd12797, 8; selp.b64 %rd10064, %rd10063, %rd12800, %p2414; mov.u64 %rd12809, 0; setp.eq.s64 %p2415, %rd12808, 0; add.s64 %rd10065, %rd10060, 4; add.s64 %rd10066, %rd10062, 4; add.s64 %rd10067, %rd10064, 4; selp.b64 %rd4367, %rd10060, %rd10065, %p2415; selp.b64 %rd12799, %rd10062, %rd10066, %p2415; selp.b64 %rd12800, %rd10064, %rd10067, %p2415; selp.b64 %rd12795, %rd10059, %rd12795, %p2414; selp.b64 %rd12796, %rd10061, %rd12796, %p2414; selp.b64 %rd12797, %rd10063, %rd12797, %p2414; add.s64 %rd10068, %rd12798, 8; selp.b64 %rd12794, %rd10068, %rd12794, %p2414; add.s64 %rd10069, %rd12804, 8; setp.eq.s64 %p2416, %rd12801, %rd12807; selp.b64 %rd10070, %rd10069, %rd12801, %p2416; add.s64 %rd10071, %rd12805, 8; selp.b64 %rd10072, %rd10071, %rd12802, %p2416; add.s64 %rd10073, %rd12806, 8; selp.b64 %rd10074, %rd10073, %rd12803, %p2416; selp.b64 %rd12804, %rd10069, %rd12804, %p2416; selp.b64 %rd12805, %rd10071, %rd12805, %p2416; selp.b64 %rd12806, %rd10073, %rd12806, %p2416; add.s64 %rd10075, %rd12801, 8; selp.b64 %rd12807, %rd10075, %rd12807, %p2416; add.s64 %rd10076, %rd10070, 4; add.s64 %rd10077, %rd10072, 4; add.s64 %rd10078, %rd10074, 4; selp.b64 %rd12801, %rd10070, %rd10076, %p2415; selp.b64 %rd12802, %rd10072, %rd10077, %p2415; selp.b64 %rd12803, %rd10074, %rd10078, %p2415; ld.local.f32 %f4641, [%rd10072]; ld.local.f32 %f4642, [%rd10062]; setp.eq.f32 %p2417, %f4642, %f4641; mov.u64 %rd12798, %rd4367; 
@%p2417 bra $L__BB1_1773; $L__BB1_1775: mov.u64 %rd11251, 0; or.b64 %rd10080, %rd11251, %rd4217; mov.b64 {%r4212, %r4213}, %rd10080; mov.b64 {%r4214, %r4215}, %rd12809; cvt.u32.u64 %r4217, %rd11251; or.b32 %r5311, %r4217, %r4185; mov.u32 %r5312, 0; mov.b32 %f5467, %r4213; mov.b32 {%rs1038, %rs841}, %r4214; mov.u32 %r5313, 2; bra.uni $L__BB1_1784; $L__BB1_1745: sub.f32 %f1280, %f1264, %f1260; sub.f32 %f1281, %f1266, %f5463; mul.f32 %f4584, %f1263, %f1265; mul.f32 %f4585, %f1261, %f1267; sub.f32 %f1282, %f4585, %f4584; mul.f32 %f4586, %f1268, %f1263; mul.f32 %f4587, %f1269, %f1261; sub.f32 %f4588, %f4587, %f4586; mul.f32 %f4589, %f4588, %f1282; setp.lt.f32 %p2377, %f4589, 0f00000000; setp.ge.f32 %p2378, %f1270, 0f00000000; and.pred %p2379, %p2378, %p2377; setp.le.f32 %p2380, %f1274, 0f00000000; and.pred %p2381, %p2380, %p2379; mov.u16 %rs1037, 0; @%p2381 bra $L__BB1_1748; mul.f32 %f4590, %f1265, %f1277; mul.f32 %f4591, %f1276, %f1267; sub.f32 %f4592, %f4590, %f4591; mul.f32 %f4593, %f1282, %f4592; setp.gt.f32 %p2382, %f4593, 0f80000000; setp.ge.f32 %p2383, %f1271, 0f00000000; and.pred %p2384, %p2383, %p2382; setp.le.f32 %p2385, %f1279, 0f00000000; and.pred %p2386, %p2385, %p2384; mov.u16 %rs1037, 1; @%p2386 bra $L__BB1_1748; mul.f32 %f4594, %f1280, %f1273; mul.f32 %f4595, %f1272, %f1281; sub.f32 %f4596, %f4594, %f4595; mul.f32 %f4597, %f1282, %f4596; setp.lt.f32 %p2387, %f4597, 0f00000000; sub.f32 %f4598, %f1275, %f1274; setp.ge.f32 %p2388, %f4598, 0f00000000; and.pred %p2389, %p2388, %p2387; sub.f32 %f4599, %f1278, %f1279; setp.ge.f32 %p2390, %f4599, 0f00000000; and.pred %p2391, %p2390, %p2389; selp.b16 %rs1037, 2, 3, %p2391; $L__BB1_1748: mul.f32 %f4600, %f1263, %f1263; fma.rn.f32 %f4601, %f1261, %f1261, %f4600; add.f32 %f1283, %f4601, 0f00000000; mul.f32 %f4602, %f1267, %f1267; fma.rn.f32 %f4603, %f1265, %f1265, %f4602; add.f32 %f1284, %f4603, 0f00000000; mul.f32 %f4604, %f1281, %f1281; fma.rn.f32 %f4605, %f1280, %f1280, %f4604; add.f32 %f1285, %f4605, 0f00000000; 
setp.eq.s16 %p2392, %rs1037, 1; @%p2392 bra $L__BB1_1763; setp.eq.s16 %p2393, %rs1037, 2; @%p2393 bra $L__BB1_1759; setp.ne.s16 %p2394, %rs1037, 3; @%p2394 bra $L__BB1_1767; sub.f32 %f4606, %f1270, %f1274; div.rn.f32 %f1286, %f1270, %f4606; sub.f32 %f4607, %f1271, %f1279; div.rn.f32 %f1287, %f1271, %f4607; sub.f32 %f4608, %f1275, %f1274; add.f32 %f4609, %f1278, %f4608; sub.f32 %f4610, %f4609, %f1279; div.rn.f32 %f5465, %f4608, %f4610; mul.f32 %f4611, %f1269, %f1269; fma.rn.f32 %f4612, %f1268, %f1268, %f4611; add.f32 %f4613, %f4612, 0f00000000; mul.f32 %f4614, %f1283, %f1286; mul.f32 %f4615, %f1286, %f4614; sub.f32 %f1289, %f4613, %f4615; mul.f32 %f4616, %f1284, %f5465; mul.f32 %f4617, %f5465, %f4616; sub.f32 %f1290, %f4613, %f4617; mul.f32 %f4618, %f1273, %f1273; fma.rn.f32 %f4619, %f1272, %f1272, %f4618; add.f32 %f4620, %f4619, 0f00000000; mul.f32 %f4621, %f1285, %f1287; mul.f32 %f4622, %f1287, %f4621; sub.f32 %f1291, %f4620, %f4622; setp.lt.f32 %p2395, %f1289, %f1290; @%p2395 bra $L__BB1_1755; bra.uni $L__BB1_1752; $L__BB1_1755: setp.lt.f32 %p2397, %f1289, %f1291; @%p2397 bra $L__BB1_1757; bra.uni $L__BB1_1756; $L__BB1_1757: mul.f32 %f5464, %f1263, %f1286; fma.rn.f32 %f5462, %f1261, %f1286, %f1258; mov.u32 %r5313, 0; mov.f32 %f5463, %f1259; mov.f32 %f5465, %f1286; bra.uni $L__BB1_1758; $L__BB1_1759: add.u64 %rd12747, %SP, 552; add.u64 %rd12753, %SP, 0; mul.f32 %f4625, %f1273, %f1281; fma.rn.f32 %f4626, %f1272, %f1280, %f4625; div.rn.f32 %f5466, %f4626, %f1285; fma.rn.f32 %f4627, %f1280, %f5466, %f1260; mov.b32 %r4194, %f4627; fma.rn.f32 %f4628, %f1281, %f5466, %f5463; mov.b32 %r4195, %f4628; cvt.u64.u32 %rd9972, %r4195; cvt.u64.u32 %rd9973, %r4194; bfi.b64 %rd4225, %rd9972, %rd9973, 32, 32; st.local.u64 [%rd4019], %rd4225; mov.u64 %rd12758, 2; mov.u64 %rd12744, %rd4032; mov.u64 %rd12745, %rd4031; mov.u64 %rd12746, %rd4031; mov.u64 %rd12748, %rd4031; mov.u64 %rd12749, %rd4031; mov.u64 %rd12750, %rd12747; mov.u64 %rd12751, %rd4019; mov.u64 %rd12752, %rd4019; 
mov.u64 %rd12754, %rd4019; mov.u64 %rd12755, %rd4019; mov.u64 %rd12756, %rd12753; mov.u64 %rd12757, %rd4030; $L__BB1_1760: setp.eq.s64 %p2398, %rd12758, 0; mov.u64 %rd12793, 1; @%p2398 bra $L__BB1_1762; add.s64 %rd12758, %rd12758, -1; add.s64 %rd9978, %rd12745, 8; setp.eq.s64 %p2399, %rd12748, %rd12744; selp.b64 %rd9979, %rd9978, %rd12748, %p2399; add.s64 %rd9980, %rd12746, 8; selp.b64 %rd9981, %rd9980, %rd12749, %p2399; add.s64 %rd9982, %rd12747, 8; selp.b64 %rd9983, %rd9982, %rd12750, %p2399; mov.u64 %rd12793, 0; setp.eq.s64 %p2400, %rd12758, 0; add.s64 %rd9984, %rd9979, 4; add.s64 %rd9985, %rd9981, 4; add.s64 %rd9986, %rd9983, 4; selp.b64 %rd4242, %rd9979, %rd9984, %p2400; selp.b64 %rd12749, %rd9981, %rd9985, %p2400; selp.b64 %rd12750, %rd9983, %rd9986, %p2400; selp.b64 %rd12745, %rd9978, %rd12745, %p2399; selp.b64 %rd12746, %rd9980, %rd12746, %p2399; selp.b64 %rd12747, %rd9982, %rd12747, %p2399; add.s64 %rd9987, %rd12748, 8; selp.b64 %rd12744, %rd9987, %rd12744, %p2399; add.s64 %rd9988, %rd12754, 8; setp.eq.s64 %p2401, %rd12751, %rd12757; selp.b64 %rd9989, %rd9988, %rd12751, %p2401; add.s64 %rd9990, %rd12755, 8; selp.b64 %rd9991, %rd9990, %rd12752, %p2401; add.s64 %rd9992, %rd12756, 8; selp.b64 %rd9993, %rd9992, %rd12753, %p2401; selp.b64 %rd12754, %rd9988, %rd12754, %p2401; selp.b64 %rd12755, %rd9990, %rd12755, %p2401; selp.b64 %rd12756, %rd9992, %rd12756, %p2401; add.s64 %rd9994, %rd12751, 8; selp.b64 %rd12757, %rd9994, %rd12757, %p2401; add.s64 %rd9995, %rd9989, 4; add.s64 %rd9996, %rd9991, 4; add.s64 %rd9997, %rd9993, 4; selp.b64 %rd12751, %rd9989, %rd9995, %p2400; selp.b64 %rd12752, %rd9991, %rd9996, %p2400; selp.b64 %rd12753, %rd9993, %rd9997, %p2400; ld.local.f32 %f4629, [%rd9991]; ld.local.f32 %f4630, [%rd9981]; setp.eq.f32 %p2402, %f4630, %f4629; mov.u64 %rd12748, %rd4242; @%p2402 bra $L__BB1_1760; $L__BB1_1762: mov.u64 %rd11248, 0; or.b64 %rd12792, %rd11248, %rd4225; mov.u32 %r5313, 1; bra.uni $L__BB1_1771; $L__BB1_1763: add.u64 %rd12763, %SP, 552; 
add.u64 %rd12769, %SP, 0; div.rn.f32 %f5466, %f1271, %f1284; fma.rn.f32 %f4631, %f1265, %f5466, %f1258; mov.b32 %r4197, %f4631; fma.rn.f32 %f4632, %f1267, %f5466, %f1259; mov.b32 %r4198, %f4632; cvt.u64.u32 %rd9999, %r4198; cvt.u64.u32 %rd10000, %r4197; bfi.b64 %rd4266, %rd9999, %rd10000, 32, 32; st.local.u64 [%rd4019], %rd4266; mov.u64 %rd12774, 2; mov.u64 %rd12760, %rd4032; mov.u64 %rd12761, %rd4031; mov.u64 %rd12762, %rd4031; mov.u64 %rd12764, %rd4031; mov.u64 %rd12765, %rd4031; mov.u64 %rd12766, %rd12763; mov.u64 %rd12767, %rd4019; mov.u64 %rd12768, %rd4019; mov.u64 %rd12770, %rd4019; mov.u64 %rd12771, %rd4019; mov.u64 %rd12772, %rd12769; mov.u64 %rd12773, %rd4028; $L__BB1_1764: setp.eq.s64 %p2403, %rd12774, 0; mov.u64 %rd12793, 1; @%p2403 bra $L__BB1_1766; add.s64 %rd12774, %rd12774, -1; add.s64 %rd10005, %rd12761, 8; setp.eq.s64 %p2404, %rd12764, %rd12760; selp.b64 %rd10006, %rd10005, %rd12764, %p2404; add.s64 %rd10007, %rd12762, 8; selp.b64 %rd10008, %rd10007, %rd12765, %p2404; add.s64 %rd10009, %rd12763, 8; selp.b64 %rd10010, %rd10009, %rd12766, %p2404; mov.u64 %rd12793, 0; setp.eq.s64 %p2405, %rd12774, 0; add.s64 %rd10011, %rd10006, 4; add.s64 %rd10012, %rd10008, 4; add.s64 %rd10013, %rd10010, 4; selp.b64 %rd4283, %rd10006, %rd10011, %p2405; selp.b64 %rd12765, %rd10008, %rd10012, %p2405; selp.b64 %rd12766, %rd10010, %rd10013, %p2405; selp.b64 %rd12761, %rd10005, %rd12761, %p2404; selp.b64 %rd12762, %rd10007, %rd12762, %p2404; selp.b64 %rd12763, %rd10009, %rd12763, %p2404; add.s64 %rd10014, %rd12764, 8; selp.b64 %rd12760, %rd10014, %rd12760, %p2404; add.s64 %rd10015, %rd12770, 8; setp.eq.s64 %p2406, %rd12767, %rd12773; selp.b64 %rd10016, %rd10015, %rd12767, %p2406; add.s64 %rd10017, %rd12771, 8; selp.b64 %rd10018, %rd10017, %rd12768, %p2406; add.s64 %rd10019, %rd12772, 8; selp.b64 %rd10020, %rd10019, %rd12769, %p2406; selp.b64 %rd12770, %rd10015, %rd12770, %p2406; selp.b64 %rd12771, %rd10017, %rd12771, %p2406; selp.b64 %rd12772, %rd10019, %rd12772, %p2406; 
add.s64 %rd10021, %rd12767, 8; selp.b64 %rd12773, %rd10021, %rd12773, %p2406; add.s64 %rd10022, %rd10016, 4; add.s64 %rd10023, %rd10018, 4; add.s64 %rd10024, %rd10020, 4; selp.b64 %rd12767, %rd10016, %rd10022, %p2405; selp.b64 %rd12768, %rd10018, %rd10023, %p2405; selp.b64 %rd12769, %rd10020, %rd10024, %p2405; ld.local.f32 %f4633, [%rd10018]; ld.local.f32 %f4634, [%rd10008]; setp.eq.f32 %p2407, %f4634, %f4633; mov.u64 %rd12764, %rd4283; @%p2407 bra $L__BB1_1764; $L__BB1_1766: mov.u64 %rd11249, 0; or.b64 %rd12792, %rd11249, %rd4266; mov.u32 %r5313, 2; bra.uni $L__BB1_1771; $L__BB1_1767: div.rn.f32 %f5466, %f1270, %f1283; fma.rn.f32 %f4635, %f1261, %f5466, %f1258; mov.b32 %r4200, %f4635; fma.rn.f32 %f4636, %f1263, %f5466, %f1259; mov.b32 %r4201, %f4636; cvt.u64.u32 %rd10026, %r4201; cvt.u64.u32 %rd10027, %r4200; bfi.b64 %rd4307, %rd10026, %rd10027, 32, 32; st.local.u64 [%rd4019], %rd4307; mov.u64 %rd12790, 2; mov.u64 %rd12776, %rd4032; mov.u64 %rd12777, %rd4031; mov.u64 %rd12778, %rd4031; mov.u64 %rd12779, %rd9754; mov.u64 %rd12780, %rd4031; mov.u64 %rd12781, %rd4031; mov.u64 %rd12782, %rd9754; mov.u64 %rd12783, %rd4019; mov.u64 %rd12784, %rd4019; mov.u64 %rd12785, %rd9748; mov.u64 %rd12786, %rd4019; mov.u64 %rd12787, %rd4019; mov.u64 %rd12788, %rd9748; mov.u64 %rd12789, %rd4026; $L__BB1_1768: setp.eq.s64 %p2408, %rd12790, 0; mov.u64 %rd12793, 1; @%p2408 bra $L__BB1_1770; add.s64 %rd12790, %rd12790, -1; add.s64 %rd10032, %rd12777, 8; setp.eq.s64 %p2409, %rd12780, %rd12776; selp.b64 %rd10033, %rd10032, %rd12780, %p2409; add.s64 %rd10034, %rd12778, 8; selp.b64 %rd10035, %rd10034, %rd12781, %p2409; add.s64 %rd10036, %rd12779, 8; selp.b64 %rd10037, %rd10036, %rd12782, %p2409; mov.u64 %rd12793, 0; setp.eq.s64 %p2410, %rd12790, 0; add.s64 %rd10038, %rd10033, 4; add.s64 %rd10039, %rd10035, 4; add.s64 %rd10040, %rd10037, 4; selp.b64 %rd4324, %rd10033, %rd10038, %p2410; selp.b64 %rd12781, %rd10035, %rd10039, %p2410; selp.b64 %rd12782, %rd10037, %rd10040, %p2410; selp.b64 
%rd12777, %rd10032, %rd12777, %p2409; selp.b64 %rd12778, %rd10034, %rd12778, %p2409; selp.b64 %rd12779, %rd10036, %rd12779, %p2409; add.s64 %rd10041, %rd12780, 8; selp.b64 %rd12776, %rd10041, %rd12776, %p2409; add.s64 %rd10042, %rd12786, 8; setp.eq.s64 %p2411, %rd12783, %rd12789; selp.b64 %rd10043, %rd10042, %rd12783, %p2411; add.s64 %rd10044, %rd12787, 8; selp.b64 %rd10045, %rd10044, %rd12784, %p2411; add.s64 %rd10046, %rd12788, 8; selp.b64 %rd10047, %rd10046, %rd12785, %p2411; selp.b64 %rd12786, %rd10042, %rd12786, %p2411; selp.b64 %rd12787, %rd10044, %rd12787, %p2411; selp.b64 %rd12788, %rd10046, %rd12788, %p2411; add.s64 %rd10048, %rd12783, 8; selp.b64 %rd12789, %rd10048, %rd12789, %p2411; add.s64 %rd10049, %rd10043, 4; add.s64 %rd10050, %rd10045, 4; add.s64 %rd10051, %rd10047, 4; selp.b64 %rd12783, %rd10043, %rd10049, %p2410; selp.b64 %rd12784, %rd10045, %rd10050, %p2410; selp.b64 %rd12785, %rd10047, %rd10051, %p2410; ld.local.f32 %f4637, [%rd10045]; ld.local.f32 %f4638, [%rd10035]; setp.eq.f32 %p2412, %f4638, %f4637; mov.u64 %rd12780, %rd4324; @%p2412 bra $L__BB1_1768; $L__BB1_1770: mov.u64 %rd11250, 0; or.b64 %rd12792, %rd11250, %rd4307; mov.u32 %r5313, 0; $L__BB1_1771: mov.f32 %f4639, 0f3F800000; sub.f32 %f4640, %f4639, %f5466; mov.b32 %r4204, %f4640; mov.b32 %r4205, %f5466; cvt.u64.u32 %rd10052, %r4205; cvt.u64.u32 %rd10053, %r4204; bfi.b64 %rd12842, %rd10052, %rd10053, 32, 32; mov.b64 {%r4206, %r4207}, %rd12793; mov.b64 {%r4208, %r4209}, %rd12792; cvt.u32.u64 %r5311, %rd12792; mov.b32 %f5467, %r4209; mov.u32 %r5312, 1; mov.b32 {%rs1038, %rs837}, %r4206; bra.uni $L__BB1_1784; $L__BB1_1752: setp.lt.f32 %p2396, %f1290, %f1291; @%p2396 bra $L__BB1_1754; bra.uni $L__BB1_1753; $L__BB1_1754: mul.f32 %f5464, %f1267, %f1287; fma.rn.f32 %f5462, %f1265, %f1287, %f1258; mov.u32 %r5313, 2; mov.f32 %f5463, %f1259; mov.f32 %f5465, %f1287; bra.uni $L__BB1_1758; $L__BB1_1756: mul.f32 %f5464, %f1281, %f5465; fma.rn.f32 %f5462, %f1280, %f5465, %f1260; mov.u32 %r5313, 1; 
bra.uni $L__BB1_1758; $L__BB1_1753: mul.f32 %f5464, %f1281, %f5465; fma.rn.f32 %f5462, %f1280, %f5465, %f1260; mov.u32 %r5313, 1; $L__BB1_1758: add.f32 %f5467, %f5463, %f5464; mov.f32 %f4623, 0f3F800000; sub.f32 %f4624, %f4623, %f5465; mov.b32 %r4192, %f4624; mov.b32 %r4193, %f5465; cvt.u64.u32 %rd9969, %r4193; cvt.u64.u32 %rd9970, %r4192; bfi.b64 %rd12842, %rd9969, %rd9970, 32, 32; mov.b32 %r5311, %f5462; mov.u32 %r5312, 1; mov.u16 %rs1038, 1; $L__BB1_1784: mov.b32 %f4647, %r5311; sub.f32 %f4648, %f4647, %f1255; mul.f32 %f4649, %f4648, %f4648; sub.f32 %f4650, %f5467, %f1256; fma.rn.f32 %f4651, %f4650, %f4650, %f4649; add.f32 %f4652, %f4651, 0f00000000; sqrt.rn.f32 %f4653, %f4652; shl.b64 %rd10135, %rd4206, 2; add.s64 %rd10136, %rd24, %rd10135; st.local.f32 [%rd10136+-4], %f4653; mul.lo.s64 %rd10137, %rd4206, 36; add.s64 %rd10138, %rd4011, %rd10137; st.local.u32 [%rd10138+-36], %r5311; st.local.f32 [%rd10138+-32], %f5467; mov.u16 %rs850, 0; st.local.v4.u8 [%rd10138+-28], {%rs1038, %rs850, %rs850, %rs850}; st.local.u32 [%rd10138+-24], %r1304; st.local.u32 [%rd10138+-20], %r5312; st.local.u32 [%rd10138+-16], %r5313; shr.u64 %rd10139, %rd12842, 32; st.local.u32 [%rd10138+-8], %rd10139; st.local.u32 [%rd10138+-12], %rd12842; $L__BB1_1785: setp.lt.u64 %p2428, %rd4206, 4; add.s64 %rd4206, %rd4206, 1; @%p2428 bra $L__BB1_1732; ld.local.v2.u64 {%rd12843, %rd12844}, [%rd24]; ld.local.v4.u32 {%r5323, %r5324, %r5325, %r4237}, [%rd4011]; ld.local.u32 %r5326, [%rd4011+16]; ld.local.u32 %rd10142, [%rd4011+36]; ld.local.u32 %rd10143, [%rd4011+40]; bfi.b64 %rd10144, %rd10143, %rd10142, 32, 32; mov.b64 {%r5320, %r5321}, %rd10144; ld.local.u32 %r5322, [%rd4011+44]; ld.local.u32 %r5327, [%rd4011+52]; ld.local.u32 %r5319, [%rd4011+80]; ld.local.u64 %rd10145, [%rd4011+72]; mov.b64 {%r5317, %r5318}, %rd10145; ld.local.u32 %r5328, [%rd4011+88]; ld.local.u32 %rd10146, [%rd4011+108]; ld.local.u32 %rd10147, [%rd4011+112]; bfi.b64 %rd10148, %rd10147, %rd10146, 32, 32; mov.b64 {%r5314, 
%r5315}, %rd10148; ld.local.u32 %r5316, [%rd4011+116]; ld.local.u32 %r5329, [%rd4011+124]; bra.uni $L__BB1_1787; $L__BB1_1730: mov.u32 %r5326, 4; mov.u32 %r5327, %r5326; mov.u32 %r5328, %r5326; mov.u32 %r5329, %r5326; $L__BB1_1787: and.b64 %rd10149, %rd4201, 1; setp.eq.b64 %p2429, %rd10149, 1; mov.pred %p2430, 0; xor.pred %p2431, %p2429, %p2430; not.pred %p2432, %p2431; mov.b64 {%r1347, %r1348}, %rd12843; mov.b32 %f1314, %r1347; mov.b32 %f1315, %r1348; mov.b64 {%r1349, %r1350}, %rd12844; mov.b32 %f1316, %r1349; mov.b32 %f1317, %r1350; @%p2432 bra $L__BB1_1796; bra.uni $L__BB1_1788; $L__BB1_1796: and.b64 %rd10165, %rd4201, 2; setp.eq.s64 %p2446, %rd10165, 0; @%p2446 bra $L__BB1_1805; bra.uni $L__BB1_1797; $L__BB1_1805: and.b64 %rd10181, %rd4201, 4; setp.eq.s64 %p2460, %rd10181, 0; @%p2460 bra $L__BB1_1814; bra.uni $L__BB1_1806; $L__BB1_1814: and.b64 %rd10197, %rd4201, 8; setp.eq.s64 %p2474, %rd10197, 0; @%p2474 bra $L__BB1_1722; ld.u8 %rs857, [%rd4190+88]; and.b16 %rs858, %rs857, 1; setp.eq.b16 %p2475, %rs858, 1; mov.pred %p2476, 0; xor.pred %p2477, %p2475, %p2476; not.pred %p2478, %p2477; @%p2478 bra $L__BB1_1818; bra.uni $L__BB1_1816; $L__BB1_1818: ld.u32 %r1398, [%rd4190+76]; cvt.u64.u32 %rd10201, %r1398; setp.le.u64 %p2485, %rd4178, %rd10201; @%p2485 bra $L__BB1_1722; neg.f32 %f1321, %f1317; setp.lt.u32 %p2486, %r1303, 64; @%p2486 bra $L__BB1_1821; bra.uni $L__BB1_1820; $L__BB1_1821: mul.wide.u32 %rd10211, %r1303, 8; add.s64 %rd10212, %rd4015, %rd10211; mov.u64 %rd12851, 0; st.local.u32 [%rd10212], %r1398; st.local.f32 [%rd10212+4], %f1321; add.s32 %r1303, %r1303, 1; st.local.u32 [%rd4015+512], %r1303; mov.u64 %rd12852, %rd12851; bra.uni $L__BB1_1822; $L__BB1_1788: ld.u8 %rs851, [%rd4190+88]; and.b16 %rs852, %rs851, 1; setp.eq.b16 %p2433, %rs852, 1; xor.pred %p2435, %p2433, %p2430; not.pred %p2436, %p2435; @%p2436 bra $L__BB1_1791; bra.uni $L__BB1_1789; $L__BB1_1791: ld.u32 %r1356, [%rd4190+64]; cvt.u64.u32 %rd10153, %r1356; setp.le.u64 %p2443, %rd4178, 
%rd10153; @%p2443 bra $L__BB1_1796; neg.f32 %f1318, %f1314; setp.lt.u32 %p2444, %r1303, 64; @%p2444 bra $L__BB1_1794; bra.uni $L__BB1_1793; $L__BB1_1794: add.s32 %r4240, %r1302, -1; mul.wide.u32 %rd10163, %r4240, 8; add.s64 %rd10164, %rd4015, %rd10163; mov.u64 %rd12845, 0; st.local.u32 [%rd10164], %r1356; st.local.f32 [%rd10164+4], %f1318; add.s32 %r1303, %r1303, 1; st.local.u32 [%rd4015+512], %r1303; mov.u64 %rd12846, %rd12845; bra.uni $L__BB1_1795; $L__BB1_1797: ld.u8 %rs853, [%rd4190+88]; and.b16 %rs854, %rs853, 1; setp.eq.b16 %p2447, %rs854, 1; mov.pred %p2448, 0; xor.pred %p2449, %p2447, %p2448; not.pred %p2450, %p2449; @%p2450 bra $L__BB1_1800; bra.uni $L__BB1_1798; $L__BB1_1800: ld.u32 %r1370, [%rd4190+68]; cvt.u64.u32 %rd10169, %r1370; setp.le.u64 %p2457, %rd4178, %rd10169; @%p2457 bra $L__BB1_1805; neg.f32 %f1319, %f1315; setp.lt.u32 %p2458, %r1303, 64; @%p2458 bra $L__BB1_1803; bra.uni $L__BB1_1802; $L__BB1_1803: mul.wide.u32 %rd10179, %r1303, 8; add.s64 %rd10180, %rd4015, %rd10179; mov.u64 %rd12847, 0; st.local.u32 [%rd10180], %r1370; st.local.f32 [%rd10180+4], %f1319; add.s32 %r1303, %r1303, 1; st.local.u32 [%rd4015+512], %r1303; mov.u64 %rd12848, %rd12847; bra.uni $L__BB1_1804; $L__BB1_1806: ld.u8 %rs855, [%rd4190+88]; and.b16 %rs856, %rs855, 1; setp.eq.b16 %p2461, %rs856, 1; mov.pred %p2462, 0; xor.pred %p2463, %p2461, %p2462; not.pred %p2464, %p2463; @%p2464 bra $L__BB1_1809; bra.uni $L__BB1_1807; $L__BB1_1809: ld.u32 %r1384, [%rd4190+72]; cvt.u64.u32 %rd10185, %r1384; setp.le.u64 %p2471, %rd4178, %rd10185; @%p2471 bra $L__BB1_1814; neg.f32 %f1320, %f1316; setp.lt.u32 %p2472, %r1303, 64; @%p2472 bra $L__BB1_1812; bra.uni $L__BB1_1811; $L__BB1_1812: mul.wide.u32 %rd10195, %r1303, 8; add.s64 %rd10196, %rd4015, %rd10195; mov.u64 %rd12849, 0; st.local.u32 [%rd10196], %r1384; st.local.f32 [%rd10196+4], %f1320; add.s32 %r1303, %r1303, 1; st.local.u32 [%rd4015+512], %r1303; mov.u64 %rd12850, %rd12849; bra.uni $L__BB1_1813; $L__BB1_1789: setp.leu.f32 %p2437, 
%f1257, %f1314; setp.eq.s32 %p2438, %r5326, 4; or.pred %p2439, %p2438, %p2437; @%p2439 bra $L__BB1_1796; ld.u32 %r4238, [%rd4190+64]; cvt.u64.u32 %rd10150, %r4238; setp.le.u64 %p2440, %rd4181, %rd10150; mul.wide.u32 %rd10151, %r4238, 12; add.s64 %rd10152, %rd4182, %rd10151; setp.eq.s64 %p2441, %rd10152, 0; or.pred %p2442, %p2440, %p2441; selp.b32 %r1298, %r1298, %r5325, %p2442; selp.b32 %r1297, %r1297, %r5324, %p2442; selp.b32 %r1296, %r1296, %r5323, %p2442; selp.b32 %r1300, %r1300, %r5326, %p2442; selp.b32 %r1301, %r1301, %r1347, %p2442; bra.uni $L__BB1_1796; $L__BB1_1798: mov.b32 %f4654, %r1301; setp.leu.f32 %p2451, %f4654, %f1315; setp.eq.s32 %p2452, %r5327, 4; or.pred %p2453, %p2452, %p2451; @%p2453 bra $L__BB1_1805; ld.u32 %r4246, [%rd4190+68]; cvt.u64.u32 %rd10166, %r4246; setp.le.u64 %p2454, %rd4181, %rd10166; mul.wide.u32 %rd10167, %r4246, 12; add.s64 %rd10168, %rd4182, %rd10167; setp.eq.s64 %p2455, %rd10168, 0; or.pred %p2456, %p2454, %p2455; selp.b32 %r1298, %r1298, %r5322, %p2456; selp.b32 %r1297, %r1297, %r5321, %p2456; selp.b32 %r1296, %r1296, %r5320, %p2456; selp.b32 %r1300, %r1300, %r5327, %p2456; selp.b32 %r1301, %r1301, %r1348, %p2456; bra.uni $L__BB1_1805; $L__BB1_1816: mov.b32 %f4656, %r1301; setp.leu.f32 %p2479, %f4656, %f1317; setp.eq.s32 %p2480, %r5329, 4; or.pred %p2481, %p2480, %p2479; @%p2481 bra $L__BB1_1722; bra.uni $L__BB1_1817; $L__BB1_1807: mov.b32 %f4655, %r1301; setp.leu.f32 %p2465, %f4655, %f1316; setp.eq.s32 %p2466, %r5328, 4; or.pred %p2467, %p2466, %p2465; @%p2467 bra $L__BB1_1814; ld.u32 %r4253, [%rd4190+72]; cvt.u64.u32 %rd10182, %r4253; setp.le.u64 %p2468, %rd4181, %rd10182; mul.wide.u32 %rd10183, %r4253, 12; add.s64 %rd10184, %rd4182, %rd10183; setp.eq.s64 %p2469, %rd10184, 0; or.pred %p2470, %p2468, %p2469; selp.b32 %r1298, %r1298, %r5319, %p2470; selp.b32 %r1297, %r1297, %r5318, %p2470; selp.b32 %r1296, %r1296, %r5317, %p2470; selp.b32 %r1300, %r1300, %r5328, %p2470; selp.b32 %r1301, %r1301, %r1349, %p2470; bra.uni 
$L__BB1_1814; $L__BB1_1820: mov.u64 %rd12852, 1; shl.b64 %rd12851, %rd10201, 32; $L__BB1_1822: mov.u64 %rd11263, 0; cvt.u32.u64 %r4262, %rd11263; cvt.u32.u64 %r4263, %rd12851; or.b32 %r4264, %r4263, %r4262; cvt.u32.u64 %r4265, %rd12852; or.b32 %r4266, %r4264, %r4265; setp.eq.s32 %p2487, %r4266, 0; @%p2487 bra $L__BB1_1722; bra.uni $L__BB1_1823; $L__BB1_1793: mov.u64 %rd12846, 1; shl.b64 %rd12845, %rd10153, 32; $L__BB1_1795: mov.u64 %rd11254, 0; cvt.u32.u64 %r4241, %rd11254; cvt.u32.u64 %r4242, %rd12845; or.b32 %r4243, %r4242, %r4241; cvt.u32.u64 %r4244, %rd12846; or.b32 %r4245, %r4243, %r4244; setp.ne.s32 %p2445, %r4245, 0; @%p2445 bra $L__BB1_1823; bra.uni $L__BB1_1796; $L__BB1_1802: mov.u64 %rd12848, 1; shl.b64 %rd12847, %rd10169, 32; $L__BB1_1804: mov.u64 %rd11257, 0; cvt.u32.u64 %r4248, %rd11257; cvt.u32.u64 %r4249, %rd12847; or.b32 %r4250, %r4249, %r4248; cvt.u32.u64 %r4251, %rd12848; or.b32 %r4252, %r4250, %r4251; setp.ne.s32 %p2459, %r4252, 0; @%p2459 bra $L__BB1_1823; bra.uni $L__BB1_1805; $L__BB1_1811: mov.u64 %rd12850, 1; shl.b64 %rd12849, %rd10185, 32; $L__BB1_1813: mov.u64 %rd11260, 0; cvt.u32.u64 %r4255, %rd11260; cvt.u32.u64 %r4256, %rd12849; or.b32 %r4257, %r4256, %r4255; cvt.u32.u64 %r4258, %rd12850; or.b32 %r4259, %r4257, %r4258; setp.ne.s32 %p2473, %r4259, 0; @%p2473 bra $L__BB1_1823; bra.uni $L__BB1_1814; $L__BB1_1824: setp.eq.s32 %p2488, %r1300, 4; mov.u64 %rd12853, %rd9885; mov.u64 %rd12854, %rd9883; mov.u64 %rd12855, %rd9885; @%p2488 bra $L__BB1_1826; mov.b64 %rd12855, {%r1296, %r1297}; mov.b32 {%rs859, %rs860}, %r1298; mov.b64 %rd10219, {%r1298, %r4267}; and.b64 %rd12853, %rd10219, 4294967040; cvt.u64.u16 %rd10220, %rs859; and.b64 %rd12854, %rd10220, 255; $L__BB1_1826: or.b64 %rd10227, %rd12854, %rd12853; or.b64 %rd10228, %rd10227, %rd9885; mov.b64 {%r4268, %r4269}, %rd10228; mov.b32 {%rs178, %rs861}, %r4268; and.b16 %rs862, %rs178, 255; setp.eq.s16 %p2489, %rs862, 2; @%p2489 bra $L__BB1_1828; cvt.u32.u64 %r4270, %rd12855; mov.b32 %f4657, 
%r4270; shr.u64 %rd10229, %rd12855, 32; cvt.u32.u64 %r4271, %rd10229; mov.b32 %f4658, %r4271; mul.f32 %f4659, %f1254, %f4657; mul.f32 %f4660, %f1253, %f4658; sub.f32 %f4661, %f4659, %f4660; mul.f32 %f4662, %f1254, %f4658; fma.rn.f32 %f4663, %f1253, %f4657, %f4662; add.f32 %f4664, %f1251, %f4661; mov.b32 %r4272, %f4664; add.f32 %f4665, %f1252, %f4663; mov.b32 %r4273, %f4665; cvt.u64.u32 %rd10230, %r4273; cvt.u64.u32 %rd10231, %r4272; cvt.u64.u16 %rd10232, %rs178; bfi.b64 %rd9885, %rd10230, %rd10231, 32, 32; and.b64 %rd10233, %rd10232, 255; mov.b64 {%r4274, %r4275}, %rd10233; mov.b32 {%rs863, %rs864}, %r4274; cvt.u64.u16 %rd9883, %rs863; $L__BB1_1828: mov.u64 %rd11272, 0; or.b64 %rd10240, %rd11272, %rd9883; or.b64 %rd4530, %rd10240, %rd11272; mov.b64 {%r4276, %r4277}, %rd4530; mov.b32 {%rs179, %rs865}, %r4276; and.b16 %rs866, %rs179, 255; setp.eq.s16 %p2490, %rs866, 2; mov.u64 %rd12858, 2; mov.u64 %rd12859, %rd11272; mov.u64 %rd12860, %rd11272; @%p2490 bra $L__BB1_1830; and.b64 %rd10242, %rd4530, 4294967040; cvt.u64.u16 %rd10243, %rs179; and.b64 %rd10244, %rd10243, 255; or.b64 %rd10245, %rd10244, %rd11272; or.b64 %rd10246, %rd10245, %rd10242; mov.b64 {%r4278, %r4279}, %rd10246; mov.b32 {%rs867, %rs868}, %r4278; not.b16 %rs869, %rs867; ld.global.u8 %rs870, [%rd4050+-32]; setp.eq.s16 %p2491, %rs870, 0; and.b16 %rs871, %rs869, 1; selp.b16 %rs872, %rs867, %rs871, %p2491; and.b64 %rd10247, %rd10246, 4294967040; cvt.u64.u16 %rd10248, %rs872; and.b64 %rd10249, %rd10248, 255; or.b64 %rd10250, %rd10247, %rd11272; or.b64 %rd10251, %rd10250, %rd10249; mov.b64 {%r4280, %r4281}, %rd10251; mov.b32 {%rs873, %rs874}, %r4280; and.b64 %rd12860, %rd10251, 4294967040; cvt.u64.u16 %rd10252, %rs873; and.b64 %rd12858, %rd10252, 255; mov.u64 %rd12859, %rd9885; $L__BB1_1830: or.b64 %rd10253, %rd12859, %rd11272; or.b64 %rd10254, %rd11272, %rd12858; or.b64 %rd10255, %rd10254, %rd12860; or.b64 %rd10256, %rd10253, %rd11272; mov.b64 {%r5360, %r5361}, %rd10256; mov.b64 {%r5362, %r4282}, %rd10255; 
bra.uni $L__BB1_1887; $L__BB1_1690: cvt.u32.u64 %r4075, %rd4052; cvt.u32.u64 %r4076, %rd4077; rem.u32 %r4077, %r4076, %r4075; cvt.u64.u32 %rd12700, %r4077; $L__BB1_1691: shl.b64 %rd9790, %rd12700, 3; add.s64 %rd4081, %rd4053, %rd9790; ld.u32 %rd9791, [%rd4081]; ld.u32 %rd9792, [%rd4081+4]; bfi.b64 %rd4082, %rd9792, %rd9791, 32, 32; add.s64 %rd4083, %rd12700, 1; or.b64 %rd9793, %rd4083, %rd4052; and.b64 %rd9794, %rd9793, -4294967296; setp.eq.s64 %p2309, %rd9794, 0; @%p2309 bra $L__BB1_1693; rem.u64 %rd12701, %rd4083, %rd4052; bra.uni $L__BB1_1694; $L__BB1_1693: cvt.u32.u64 %r4078, %rd4052; cvt.u32.u64 %r4079, %rd4083; rem.u32 %r4080, %r4079, %r4078; cvt.u64.u32 %rd12701, %r4080; $L__BB1_1694: add.u64 %rd12711, %SP, 560; shl.b64 %rd9796, %rd12701, 3; add.s64 %rd4093, %rd4053, %rd9796; ld.u32 %rd9797, [%rd4093]; ld.u32 %rd9798, [%rd4093+4]; bfi.b64 %rd9799, %rd9798, %rd9797, 32, 32; st.local.v2.u64 [%rd4011], {%rd4082, %rd9799}; mov.u64 %rd12716, 2; mov.u64 %rd12702, %rd4040; mov.u64 %rd12703, %rd4038; mov.u64 %rd12704, %rd4038; mov.u64 %rd12705, %rd4039; mov.u64 %rd12706, %rd4038; mov.u64 %rd12707, %rd4038; mov.u64 %rd12708, %rd4039; mov.u64 %rd12709, %rd4011; mov.u64 %rd12710, %rd4011; mov.u64 %rd12712, %rd4011; mov.u64 %rd12713, %rd4011; mov.u64 %rd12714, %rd12711; mov.u64 %rd12715, %rd4041; $L__BB1_1695: setp.eq.s64 %p2310, %rd12716, 0; @%p2310 bra $L__BB1_1698; add.s64 %rd12716, %rd12716, -1; add.s64 %rd9800, %rd12703, 8; setp.eq.s64 %p2311, %rd12706, %rd12702; selp.b64 %rd9801, %rd9800, %rd12706, %p2311; add.s64 %rd9802, %rd12704, 8; selp.b64 %rd9803, %rd9802, %rd12707, %p2311; add.s64 %rd9804, %rd12705, 8; selp.b64 %rd9805, %rd9804, %rd12708, %p2311; setp.eq.s64 %p2312, %rd12716, 0; add.s64 %rd9806, %rd9801, 4; add.s64 %rd9807, %rd9803, 4; add.s64 %rd9808, %rd9805, 4; selp.b64 %rd4110, %rd9801, %rd9806, %p2312; selp.b64 %rd12707, %rd9803, %rd9807, %p2312; selp.b64 %rd12708, %rd9805, %rd9808, %p2312; selp.b64 %rd12703, %rd9800, %rd12703, %p2311; selp.b64 
%rd12704, %rd9802, %rd12704, %p2311; selp.b64 %rd12705, %rd9804, %rd12705, %p2311; add.s64 %rd9809, %rd12706, 8; selp.b64 %rd12702, %rd9809, %rd12702, %p2311; add.s64 %rd9810, %rd12712, 8; setp.eq.s64 %p2313, %rd12709, %rd12715; selp.b64 %rd9811, %rd9810, %rd12709, %p2313; add.s64 %rd9812, %rd12713, 8; selp.b64 %rd9813, %rd9812, %rd12710, %p2313; add.s64 %rd9814, %rd12714, 8; selp.b64 %rd9815, %rd9814, %rd12711, %p2313; selp.b64 %rd12712, %rd9810, %rd12712, %p2313; selp.b64 %rd12713, %rd9812, %rd12713, %p2313; selp.b64 %rd12714, %rd9814, %rd12714, %p2313; add.s64 %rd9816, %rd12709, 8; selp.b64 %rd12715, %rd9816, %rd12715, %p2313; add.s64 %rd9817, %rd9811, 4; add.s64 %rd9818, %rd9813, 4; add.s64 %rd9819, %rd9815, 4; selp.b64 %rd12709, %rd9811, %rd9817, %p2312; selp.b64 %rd12710, %rd9813, %rd9818, %p2312; selp.b64 %rd12711, %rd9815, %rd9819, %p2312; ld.local.f32 %f4428, [%rd9813]; ld.local.f32 %f4429, [%rd9803]; setp.eq.f32 %p2314, %f4429, %f4428; mov.u64 %rd12706, %rd4110; @%p2314 bra $L__BB1_1695; bra.uni $L__BB1_1697; $L__BB1_1698: ld.u32 %rd9820, [%rd4081]; ld.u32 %rd9821, [%rd4081+4]; bfi.b64 %rd9822, %rd9821, %rd9820, 32, 32; cvt.u32.u64 %r4081, %rd9822; mov.b32 %f4430, %r4081; shr.u64 %rd9823, %rd9822, 32; cvt.u32.u64 %r4082, %rd9823; mov.b32 %f4431, %r4082; ld.u32 %rd9824, [%rd4093]; ld.u32 %rd9825, [%rd4093+4]; bfi.b64 %rd9826, %rd9825, %rd9824, 32, 32; cvt.u32.u64 %r4083, %rd9826; shr.u64 %rd9827, %rd9826, 32; cvt.u32.u64 %r4084, %rd9827; mov.b32 %f4432, %r4083; sub.f32 %f5460, %f4432, %f4430; mov.b32 %f4433, %r4084; sub.f32 %f5461, %f4433, %f4431; bra.uni $L__BB1_1709; $L__BB1_1703: cvt.u32.u64 %r4085, %rd4052; cvt.u32.u64 %r4086, %rd4124; rem.u32 %r4087, %r4086, %r4085; cvt.u64.u32 %rd12717, %r4087; $L__BB1_1704: shl.b64 %rd9836, %rd12717, 3; add.s64 %rd9837, %rd4053, %rd9836; ld.u32 %rd9838, [%rd9837]; ld.u32 %rd9839, [%rd9837+4]; bfi.b64 %rd4135, %rd9839, %rd9838, 32, 32; st.local.v2.u64 [%rd4011], {%rd4125, %rd4135}; mov.u64 %rd12732, 2; mov.u64 
%rd12718, %rd4038; mov.u64 %rd12719, %rd4034; mov.u64 %rd12720, %rd4034; mov.u64 %rd12721, %rd4037; mov.u64 %rd12722, %rd4034; mov.u64 %rd12723, %rd4034; mov.u64 %rd12724, %rd4037; mov.u64 %rd12725, %rd4042; mov.u64 %rd12726, %rd4042; mov.u64 %rd12727, %rd4043; mov.u64 %rd12728, %rd4042; mov.u64 %rd12729, %rd4042; mov.u64 %rd12730, %rd4043; mov.u64 %rd12731, %rd4044; $L__BB1_1705: setp.eq.s64 %p2318, %rd12732, 0; @%p2318 bra $L__BB1_1708; add.s64 %rd12732, %rd12732, -1; add.s64 %rd9840, %rd12719, 8; setp.eq.s64 %p2319, %rd12722, %rd12718; selp.b64 %rd9841, %rd9840, %rd12722, %p2319; add.s64 %rd9842, %rd12720, 8; selp.b64 %rd9843, %rd9842, %rd12723, %p2319; add.s64 %rd9844, %rd12721, 8; selp.b64 %rd9845, %rd9844, %rd12724, %p2319; setp.eq.s64 %p2320, %rd12732, 0; add.s64 %rd9846, %rd9841, 4; add.s64 %rd9847, %rd9843, 4; add.s64 %rd9848, %rd9845, 4; selp.b64 %rd4152, %rd9841, %rd9846, %p2320; selp.b64 %rd12723, %rd9843, %rd9847, %p2320; selp.b64 %rd12724, %rd9845, %rd9848, %p2320; selp.b64 %rd12719, %rd9840, %rd12719, %p2319; selp.b64 %rd12720, %rd9842, %rd12720, %p2319; selp.b64 %rd12721, %rd9844, %rd12721, %p2319; add.s64 %rd9849, %rd12722, 8; selp.b64 %rd12718, %rd9849, %rd12718, %p2319; add.s64 %rd9850, %rd12728, 8; setp.eq.s64 %p2321, %rd12725, %rd12731; selp.b64 %rd9851, %rd9850, %rd12725, %p2321; add.s64 %rd9852, %rd12729, 8; selp.b64 %rd9853, %rd9852, %rd12726, %p2321; add.s64 %rd9854, %rd12730, 8; selp.b64 %rd9855, %rd9854, %rd12727, %p2321; selp.b64 %rd12728, %rd9850, %rd12728, %p2321; selp.b64 %rd12729, %rd9852, %rd12729, %p2321; selp.b64 %rd12730, %rd9854, %rd12730, %p2321; add.s64 %rd9856, %rd12725, 8; selp.b64 %rd12731, %rd9856, %rd12731, %p2321; add.s64 %rd9857, %rd9851, 4; add.s64 %rd9858, %rd9853, 4; add.s64 %rd9859, %rd9855, 4; selp.b64 %rd12725, %rd9851, %rd9857, %p2320; selp.b64 %rd12726, %rd9853, %rd9858, %p2320; selp.b64 %rd12727, %rd9855, %rd9859, %p2320; ld.local.f32 %f4434, [%rd9853]; ld.local.f32 %f4435, [%rd9843]; setp.eq.f32 %p2322, 
%f4435, %f4434; mov.u64 %rd12722, %rd4152; @%p2322 bra $L__BB1_1705; bra.uni $L__BB1_1707; $L__BB1_1708: cvt.u32.u64 %r4088, %rd4125; mov.b32 %f4436, %r4088; shr.u64 %rd9860, %rd4125, 32; cvt.u32.u64 %r4089, %rd9860; mov.b32 %f4437, %r4089; shr.u64 %rd9861, %rd4135, 32; cvt.u32.u64 %r4090, %rd9861; cvt.u32.u64 %r4091, %rd4135; mov.b32 %f4438, %r4091; sub.f32 %f4439, %f4438, %f4436; mov.b32 %f4440, %r4090; sub.f32 %f4441, %f4440, %f4437; neg.f32 %f5460, %f4439; neg.f32 %f5461, %f4441; $L__BB1_1709: mul.f32 %f4442, %f1243, %f5461; fma.rn.f32 %f1250, %f1242, %f5460, %f4442; mul.f32 %f4443, %f5461, %f5461; fma.rn.f32 %f4444, %f5460, %f5460, %f4443; add.f32 %f4445, %f4444, 0f00000000; sqrt.rn.f32 %f4446, %f4445; mul.f32 %f4447, %f4446, 0f3A83126F; abs.f32 %f4448, %f1250; setp.gt.f32 %p2323, %f4448, %f4447; @%p2323 bra $L__BB1_1711; bra.uni $L__BB1_1710; $L__BB1_1711: setp.ge.f32 %p2951, %f1250, 0f00000000; bra.uni $L__BB1_1714; $L__BB1_1710: ld.local.u64 %rd9862, [%rd4015+8]; cvt.u32.u64 %r4092, %rd9862; mov.b32 %f4449, %r4092; shr.u64 %rd9863, %rd9862, 32; cvt.u32.u64 %r4093, %rd9863; mov.b32 %f4450, %r4093; sub.f32 %f4451, %f1208, %f4449; sub.f32 %f4452, %f1209, %f4450; mul.f32 %f4453, %f1243, %f4452; fma.rn.f32 %f4454, %f1242, %f4451, %f4453; setp.le.f32 %p2951, %f4454, 0f00000000; $L__BB1_1714: selp.u16 %rs810, 1, 0, %p2951; st.local.u8 [%rd4015+16], %rs810; $L__BB1_1715: setp.eq.s32 %p2952, %r1282, 2; ld.local.v2.u32 {%r5299, %r5300}, [%rd4015+8]; ld.local.u32 %r5301, [%rd4015+16]; $L__BB1_1717: mov.u64 %rd9871, 0; mov.u64 %rd12733, 2; mov.u64 %rd12734, %rd9871; @%p2952 bra $L__BB1_1719; setp.ne.s16 %p2324, %rs163, 0; cvt.u16.u32 %rs812, %r5301; selp.u16 %rs813, 1, 0, %p2324; xor.b16 %rs814, %rs812, %rs813; mov.b32 %f4461, %r5299; mov.b32 %f4462, %r5300; mul.f32 %f4463, %f1213, %f4461; mul.f32 %f4464, %f1212, %f4462; sub.f32 %f4465, %f4463, %f4464; mul.f32 %f4466, %f1212, %f4461; fma.rn.f32 %f4467, %f1213, %f4462, %f4466; add.f32 %f4468, %f1210, %f4465; mov.b32 
%r4098, %f4468; add.f32 %f4469, %f1211, %f4467; mov.b32 %r4099, %f4469; cvt.u64.u32 %rd9872, %r4099; cvt.u64.u32 %rd9873, %r4098; cvt.u64.u16 %rd9874, %rs814; bfi.b64 %rd12734, %rd9872, %rd9873, 32, 32; and.b64 %rd9875, %rd9874, 255; mov.b64 {%r4100, %r4101}, %rd9875; mov.b32 {%rs815, %rs816}, %r4100; cvt.u64.u16 %rd12733, %rs815; $L__BB1_1719: or.b64 %rd9876, %rd9871, %rd9871; or.b64 %rd9877, %rd12733, %rd9871; or.b64 %rd9878, %rd9877, %rd9871; or.b64 %rd9879, %rd9876, %rd12734; mov.b64 {%r5360, %r5361}, %rd9879; mov.b64 {%r5362, %r4102}, %rd9878; $L__BB1_1887: mov.b32 {%rs184, %rs884}, %r5362; and.b16 %rs885, %rs184, 255; setp.eq.s16 %p2569, %rs885, 2; @%p2569 bra $L__BB1_1889; mov.b64 %rd10331, {%r5362, %r4349}; shr.u64 %rd10332, %rd10331, 8; and.b64 %rd10333, %rd10332, 16777215; cvt.u64.u16 %rd10334, %rs184; and.b64 %rd10335, %rd10334, 255; mov.b64 %rd9757, {%r5360, %r5361}; bfi.b64 %rd4584, %rd10333, %rd10335, 8, 56; mov.b64 {%r4022, %r4350}, %rd4584; $L__BB1_1889: mov.b32 {%rs886, %rs887}, %r4022; and.b16 %rs888, %rs886, 255; setp.eq.s16 %p2570, %rs888, 2; cvt.u64.u16 %rd10336, %rs886; and.b64 %rd10337, %rd10336, 255; selp.b64 %rd10338, 2, %rd10337, %p2570; mov.b64 %rd10339, {%r4022, %r4351}; and.b64 %rd10340, %rd10339, 4294967040; or.b64 %rd4590, %rd10340, %rd10338; mov.b64 {%r4352, %r4353}, %rd4590; mov.b32 {%rs185, %rs889}, %r4352; and.b16 %rs890, %rs185, 255; setp.eq.s16 %p2571, %rs890, 2; @%p2571 bra $L__BB1_1891; bra.uni $L__BB1_1890; $L__BB1_1891: setp.ne.s64 %p2572, %rd4049, 0; add.s64 %rd12688, %rd4047, 280; add.s64 %rd12689, %rd4048, 280; @%p2572 bra $L__BB1_1658; $L__BB1_1892: add.s64 %rd4640, %rd4047, 280; add.s64 %rd4642, %rd4048, 280; mov.u64 %rd9757, %rd9740; bra.uni $L__BB1_1893; $L__BB1_1890: add.s64 %rd4640, %rd4047, 280; add.s64 %rd4642, %rd4048, 280; shl.b64 %rd10341, %rd4590, 16; shr.u64 %rd10342, %rd10341, 24; cvt.u64.u16 %rd10343, %rs185; and.b64 %rd10344, %rd10343, 255; bfi.b64 %rd10345, %rd10342, %rd10344, 8, 56; mov.b64 {%r4019, 
%r4354}, %rd10345; $L__BB1_1893: mov.b32 {%rs891, %rs892}, %r4019; and.b16 %rs893, %rs891, 255; setp.eq.s16 %p2573, %rs893, 2; cvt.u64.u16 %rd10348, %rs891; and.b64 %rd10349, %rd10348, 255; selp.b64 %rd10350, 2, %rd10349, %p2573; mov.b64 %rd10351, {%r4019, %r4358}; and.b64 %rd10352, %rd10351, 4294967040; or.b64 %rd10353, %rd10352, %rd9740; or.b64 %rd4604, %rd10353, %rd10350; mov.b64 {%r4359, %r4360}, %rd4604; mov.b32 {%rs186, %rs894}, %r4359; and.b16 %rs895, %rs186, 255; setp.eq.s16 %p2574, %rs895, 2; mov.f32 %f4766, 0f00000000; @%p2574 bra $L__BB1_2129; and.b64 %rd10354, %rd4604, 4294967040; cvt.u64.u16 %rd10355, %rs186; and.b64 %rd10356, %rd10355, 255; or.b64 %rd10357, %rd10356, %rd9740; or.b64 %rd10358, %rd10357, %rd10354; mov.b64 {%r4361, %r4362}, %rd10358; mov.b32 {%rs896, %rs897}, %r4361; shr.u64 %rd10359, %rd9757, 32; cvt.u32.u64 %r4363, %rd10359; cvt.u32.u64 %r4364, %rd9757; mov.b32 %f4767, %r4364; sub.f32 %f4768, %f4767, %f1208; mov.b32 %f4769, %r4363; sub.f32 %f4770, %f4769, %f1209; mul.f32 %f4771, %f4770, %f4770; fma.rn.f32 %f4772, %f4768, %f4768, %f4771; add.f32 %f4773, %f4772, 0f00000000; sqrt.rn.f32 %f4774, %f4773; and.b16 %rs898, %rs896, 1; setp.eq.b16 %p2575, %rs898, 1; selp.f32 %f4775, 0fBF800000, 0f3F800000, %p2575; mul.f32 %f1387, %f4775, %f4774; setp.eq.s64 %p2576, %rd4642, 0; setp.eq.s64 %p2577, %rd4049, 0; or.pred %p2578, %p2576, %p2577; @%p2578 bra $L__BB1_2127; add.u64 %rd10360, %SP, 560; add.u64 %rd4605, %SPL, 560; add.u64 %rd10364, %SP, 32; add.u64 %rd4609, %SPL, 32; add.s64 %rd4611, %rd4605, 8; add.u64 %rd10367, %SP, 0; add.u64 %rd4613, %SPL, 0; add.s64 %rd4614, %rd4613, 8; add.s64 %rd4616, %rd4613, 8; add.s64 %rd4618, %rd4613, 8; add.s64 %rd4620, %rd4613, 8; add.s64 %rd4622, %rd4613, 8; add.s64 %rd4624, %rd4613, 8; add.u64 %rd10373, %SP, 552; add.u64 %rd4625, %SPL, 552; add.s64 %rd4626, %rd4625, 8; add.s64 %rd4628, %rd4609, 36; add.s64 %rd4630, %rd4609, 4; add.s64 %rd4631, %rd10364, 36; add.s64 %rd4632, %rd4609, 44; add.s64 %rd4633, 
%rd10364, 44; add.s64 %rd4634, %rd4609, 52; add.s64 %rd4635, %rd4605, 8; add.s64 %rd4636, %rd4605, 8; or.b64 %rd4637, %rd10360, 8; add.s64 %rd4638, %rd4605, 16; mov.u64 %rd4641, %rd4640; $L__BB1_1896: add.s64 %rd4049, %rd4049, -1; ld.global.u32 %r4365, [%rd4640+272]; setp.eq.s32 %p2579, %r4365, 3; @%p2579 bra $L__BB1_2126; ld.global.u16 %rs899, [%rd4641]; setp.eq.s16 %p2580, %rs899, 1; @%p2580 bra $L__BB1_2068; setp.eq.s16 %p2581, %rs899, 2; @%p2581 bra $L__BB1_1957; setp.ne.s16 %p2582, %rs899, 3; @%p2582 bra $L__BB1_2106; ld.global.u8 %rs187, [%rd4641+24]; ld.global.f32 %f1388, [%rd4641+256]; sub.f32 %f4776, %f1208, %f1388; ld.global.f32 %f1389, [%rd4641+260]; sub.f32 %f4777, %f1209, %f1389; ld.global.f32 %f1390, [%rd4641+252]; ld.global.f32 %f1391, [%rd4641+248]; mul.f32 %f4778, %f4777, %f1390; fma.rn.f32 %f1392, %f4776, %f1391, %f4778; mul.f32 %f4779, %f4776, %f1390; mul.f32 %f4780, %f4777, %f1391; sub.f32 %f1393, %f4780, %f4779; mov.u32 %r1469, 2; st.local.u32 [%rd4609+20], %r1469; ld.global.u64 %rd4646, [%rd4641+16]; setp.eq.s64 %p2584, %rd4646, 0; mov.pred %p2959, -1; @%p2584 bra $L__BB1_1954; mov.b32 %r4381, %f1393; ld.global.u64 %rd4647, [%rd4641+8]; mov.b32 %r4382, %f1392; and.b32 %r4383, %r4382, 2147483647; mov.b32 %f1394, %r4383; and.b32 %r4384, %r4381, 2147483647; mov.b32 %f1395, %r4384; mov.u64 %rd12886, 1; bra.uni $L__BB1_1902; $L__BB1_1910: sub.f32 %f4792, %f5477, %f1392; abs.f32 %f1410, %f4792; setp.le.f32 %p2594, %f1410, 0f34000000; @%p2594 bra $L__BB1_1912; abs.f32 %f4793, %f5477; abs.f32 %f4794, %f1392; setp.gt.f32 %p2596, %f4794, %f4793; selp.f32 %f4795, %f4794, %f4793, %p2596; mul.f32 %f4796, %f4795, 0f34000000; setp.gtu.f32 %p2597, %f1410, %f4796; @%p2597 bra $L__BB1_1916; bra.uni $L__BB1_1912; $L__BB1_1902: shl.b64 %rd10378, %rd12886, 3; add.s64 %rd10379, %rd4647, %rd10378; setp.eq.s64 %p2585, %rd12886, %rd4646; selp.b64 %rd10380, 0, %rd12886, %p2585; shl.b64 %rd10381, %rd10380, 3; add.s64 %rd10382, %rd4647, %rd10381; ld.u32 %rd10383, 
[%rd10382]; ld.u32 %rd10384, [%rd10382+4]; bfi.b64 %rd4654, %rd10384, %rd10383, 32, 32; ld.u32 %rd10385, [%rd10379+-8]; ld.u32 %rd10386, [%rd10379+-4]; bfi.b64 %rd4655, %rd10386, %rd10385, 32, 32; cvt.u32.u64 %r5367, %rd4655; mov.b32 %f5477, %r5367; shr.u64 %rd10387, %rd4655, 32; cvt.u32.u64 %r4388, %rd10387; mov.b32 %f1398, %r4388; cvt.u32.u64 %r1452, %rd4654; shr.u64 %rd10388, %rd4654, 32; cvt.u32.u64 %r4389, %rd10388; mov.b32 %f1399, %r1452; sub.f32 %f1400, %f1399, %f5477; mov.b32 %f4782, %r4389; sub.f32 %f1401, %f4782, %f1398; sub.f32 %f4783, %f1392, %f5477; sub.f32 %f4784, %f1393, %f1398; mul.f32 %f4785, %f1401, %f4784; fma.rn.f32 %f1402, %f1400, %f4783, %f4785; mul.f32 %f4786, %f1401, %f1401; fma.rn.f32 %f4787, %f1400, %f1400, %f4786; add.f32 %f1403, %f4787, 0f00000000; setp.gtu.f32 %p2586, %f1402, 0f00000000; mov.b64 {%r4390, %r5368}, %rd4655; mov.b64 {%r4391, %r1454}, %rd4654; @%p2586 bra $L__BB1_1904; bra.uni $L__BB1_1903; $L__BB1_1904: setp.ltu.f32 %p2587, %f1402, %f1403; @%p2587 bra $L__BB1_1906; bra.uni $L__BB1_1905; $L__BB1_1906: setp.eq.f32 %p2588, %f1403, 0f00000000; @%p2588 bra $L__BB1_1953; div.rn.f32 %f4788, %f1402, %f1403; mov.f32 %f4789, 0f3F800000; sub.f32 %f4790, %f4789, %f4788; mov.b32 %r5370, %f4790; mov.b32 %r5371, %f4788; fma.rn.f32 %f5477, %f1400, %f4788, %f5477; mov.b32 %r5367, %f5477; fma.rn.f32 %f5478, %f1401, %f4788, %f1398; mov.b32 %r5368, %f5478; mov.u32 %r5369, 1; bra.uni $L__BB1_1908; $L__BB1_1903: mov.b32 %f5478, %r5368; mov.u32 %r5369, 0; mov.u32 %r5370, %r5369; bra.uni $L__BB1_1908; $L__BB1_1905: mov.b32 %f5478, %r1454; mov.u32 %r5370, 1; mov.u32 %r5369, 0; mov.f32 %f5477, %f1399; mov.u32 %r5367, %r1452; mov.u32 %r5368, %r1454; $L__BB1_1908: setp.eq.f32 %p2589, %f1392, %f5477; @%p2589 bra $L__BB1_1912; bra.uni $L__BB1_1909; $L__BB1_1912: setp.eq.f32 %p2599, %f5478, %f1393; mov.pred %p2598, -1; mov.pred %p2957, %p2598; @%p2599 bra $L__BB1_1916; setp.eq.f32 %p2601, %f1395, 0f7F800000; and.b32 %r4400, %r5368, 2147483647; mov.b32 
%f4797, %r4400; setp.eq.f32 %p2602, %f4797, 0f7F800000; or.pred %p2603, %p2601, %p2602; mov.pred %p2957, 0; @%p2603 bra $L__BB1_1916; sub.f32 %f4798, %f5478, %f1393; abs.f32 %f1411, %f4798; setp.le.f32 %p2605, %f1411, 0f34000000; mov.pred %p2957, %p2598; @%p2605 bra $L__BB1_1916; abs.f32 %f4799, %f5478; abs.f32 %f4800, %f1393; setp.gt.f32 %p2606, %f4800, %f4799; selp.f32 %f4801, %f4800, %f4799, %p2606; mul.f32 %f4802, %f4801, 0f34000000; setp.le.f32 %p2957, %f1411, %f4802; bra.uni $L__BB1_1916; $L__BB1_1909: setp.eq.f32 %p2591, %f1394, 0f7F800000; and.b32 %r4399, %r5367, 2147483647; mov.b32 %f4791, %r4399; setp.eq.f32 %p2592, %f4791, 0f7F800000; or.pred %p2593, %p2591, %p2592; mov.pred %p2957, 0; @%p2593 bra $L__BB1_1916; bra.uni $L__BB1_1910; $L__BB1_1916: cvt.u64.u32 %rd10389, %r5368; cvt.u64.u32 %rd10390, %r5367; bfi.b64 %rd4656, %rd10389, %rd10390, 32, 32; mov.b64 {%r4401, %r4402}, %rd4656; selp.u64 %rd4657, 1, 0, %p2957; mov.b32 %f1413, %r4402; mov.b32 %f1412, %r4401; sub.f32 %f4803, %f1412, %f1392; sub.f32 %f4804, %f1413, %f1393; mul.f32 %f4805, %f4804, %f4804; fma.rn.f32 %f4806, %f4803, %f4803, %f4805; add.f32 %f4807, %f4806, 0f00000000; sqrt.rn.f32 %f1415, %f4807; setp.geu.f32 %p2607, %f1415, %f5479; setp.ne.s32 %p2608, %r1469, 2; and.pred %p2609, %p2608, %p2607; @%p2609 bra $L__BB1_1918; add.s64 %rd12887, %rd12886, -1; st.local.u64 [%rd4609], %rd12887; st.local.v2.f32 [%rd4609+8], {%f1412, %f1413}; mov.b64 {%r4405, %r4406}, %rd4657; st.local.v2.u32 [%rd4609+16], {%r4405, %r5369}; st.local.v2.u32 [%rd4609+24], {%r5370, %r5371}; st.local.f32 [%rd4609+32], %f1415; st.local.u32 [%rd4609+36], %rd4655; st.local.u32 [%rd4609+44], %rd4654; st.local.u32 [%rd4609+40], %rd10387; st.local.u32 [%rd4609+48], %rd10388; mov.u32 %r5372, %r5370; mov.u64 %rd12888, %rd4655; mov.u64 %rd12889, %rd4654; mov.f32 %f5479, %f1415; mov.u32 %r1469, %r5369; $L__BB1_1918: add.s64 %rd4662, %rd12886, 1; setp.lt.u64 %p2610, %rd12886, %rd4646; mov.u64 %rd12886, %rd4662; @%p2610 bra 
$L__BB1_1902; cvt.u32.u64 %r4407, %rd12888; mov.b32 %f4808, %r4407; shr.u64 %rd10397, %rd12888, 32; cvt.u32.u64 %r4408, %rd10397; mov.b32 %f4809, %r4408; shr.u64 %rd10398, %rd12889, 32; cvt.u32.u64 %r4409, %rd10398; cvt.u32.u64 %r4410, %rd12889; mov.b32 %f4810, %r4410; sub.f32 %f1417, %f4810, %f4808; mov.b32 %f4811, %r4409; sub.f32 %f1418, %f4811, %f4809; mul.f32 %f4812, %f1418, %f1418; fma.rn.f32 %f4813, %f1417, %f1417, %f4812; add.f32 %f1419, %f4813, 0f00000000; setp.leu.f32 %p2611, %f1419, 0f28800000; mov.u64 %rd10396, 0; mov.u64 %rd12890, %rd10396; mov.u64 %rd12891, %rd10396; mov.u64 %rd12892, %rd10396; @%p2611 bra $L__BB1_1921; neg.f32 %f4814, %f1417; sqrt.rn.f32 %f4815, %f1419; div.rn.f32 %f4816, %f1418, %f4815; div.rn.f32 %f4817, %f4814, %f4815; mov.b32 %r4411, %f4817; mov.b32 %r4412, %f4816; mov.u64 %rd12892, 1; mov.b64 %rd10401, {%r4412, %r4411}; shr.u64 %rd12891, %rd10401, 32; shl.b64 %rd12890, %rd10401, 32; $L__BB1_1921: or.b64 %rd4669, %rd12892, %rd12890; or.b64 %rd4670, %rd10396, %rd12891; and.b64 %rd10402, %rd10396, 4294967295; xor.b64 %rd10403, %rd12892, 1; or.b64 %rd10404, %rd10403, %rd10402; setp.ne.s64 %p2612, %rd10404, 0; @%p2612 bra $L__BB1_1952; mov.b64 {%r4413, %r4414}, %rd4670; mov.b64 {%r4415, %r4416}, %rd4669; mov.b32 %f1420, %r4416; mov.b32 %f1421, %r4413; setp.eq.s32 %p2613, %r1469, 1; @%p2613 bra $L__BB1_1950; bra.uni $L__BB1_1923; $L__BB1_1950: ld.local.u64 %rd10481, [%rd4609+8]; cvt.u32.u64 %r4436, %rd10481; mov.b32 %f4845, %r4436; shr.u64 %rd10482, %rd10481, 32; cvt.u32.u64 %r4437, %rd10482; mov.b32 %f4846, %r4437; sub.f32 %f4847, %f1208, %f4845; sub.f32 %f4848, %f1209, %f4846; mul.f32 %f4849, %f1421, %f4848; fma.rn.f32 %f4850, %f1420, %f4847, %f4849; setp.le.f32 %p2958, %f4850, 0f00000000; bra.uni $L__BB1_1951; $L__BB1_1957: ld.global.f32 %f1429, [%rd4641+256]; mov.u64 %rd10502, 0; sub.f32 %f4860, %f1208, %f1429; ld.global.f32 %f1430, [%rd4641+260]; sub.f32 %f4861, %f1209, %f1430; ld.global.f32 %f1431, [%rd4641+252]; ld.global.f32 
%f1432, [%rd4641+248]; mul.f32 %f4862, %f4861, %f1431; fma.rn.f32 %f1433, %f4860, %f1432, %f4862; mul.f32 %f4863, %f4860, %f1431; mul.f32 %f4864, %f4861, %f1432; sub.f32 %f1434, %f4864, %f4863; mov.b32 %r4445, %f1433; mov.b32 %r4446, %f1434; cvt.u64.u32 %rd10503, %r4446; cvt.u64.u32 %rd10504, %r4445; bfi.b64 %rd10505, %rd10503, %rd10504, 32, 32; st.local.u64 [%rd4625], %rd10505; ld.global.u64 %rd4772, [%rd4641+32]; setp.eq.s64 %p2633, %rd4772, 0; mov.u64 %rd10500, 2; mov.u64 %rd13046, %rd10502; mov.u64 %rd13047, %rd10500; mov.u64 %rd13048, %rd10502; @%p2633 bra $L__BB1_2063; mov.u32 %r4453, 0; st.local.u32 [%rd4609], %r4453; mov.u32 %r4454, -16777217; st.local.u32 [%rd4609+4], %r4454; mov.u32 %r1490, 1; st.local.u32 [%rd4609+512], %r1490; ld.global.u64 %rd4774, [%rd4641+24]; ld.global.u64 %rd4775, [%rd4641+80]; ld.global.u64 %rd4776, [%rd4641+72]; mov.u32 %r1488, 2139095039; mov.u32 %r1487, 4; bra.uni $L__BB1_1959; $L__BB1_2068: ld.global.f32 %f1500, [%rd4641+256]; sub.f32 %f5056, %f1208, %f1500; ld.global.f32 %f1501, [%rd4641+260]; sub.f32 %f5057, %f1209, %f1501; ld.global.f32 %f1502, [%rd4641+252]; ld.global.f32 %f1503, [%rd4641+248]; mul.f32 %f5058, %f5057, %f1502; fma.rn.f32 %f1504, %f5056, %f1503, %f5058; mul.f32 %f5059, %f5056, %f1502; mul.f32 %f5060, %f5057, %f1503; sub.f32 %f1505, %f5060, %f5059; mov.b32 %r1591, %f1504; mov.b32 %r1592, %f1505; ld.global.v2.f32 {%f5061, %f5062}, [%rd4641+56]; ld.global.v2.f32 {%f5063, %f5064}, [%rd4641+48]; sub.f32 %f5065, %f1504, %f6; sub.f32 %f5066, %f1505, %f6; mov.b32 %r4625, %f5065; mov.b32 %r4626, %f5066; cvt.u64.u32 %rd10874, %r4626; cvt.u64.u32 %rd10875, %r4625; add.f32 %f5067, %f6, %f1504; add.f32 %f5068, %f6, %f1505; mov.b32 %r4627, %f5067; mov.b32 %r4628, %f5068; cvt.u64.u32 %rd10876, %r4628; cvt.u64.u32 %rd10877, %r4627; bfi.b64 %rd10878, %rd10874, %rd10875, 32, 32; mov.b64 {%r4629, %r4630}, %rd10878; bfi.b64 %rd10879, %rd10876, %rd10877, 32, 32; mov.b64 {%r4631, %r4632}, %rd10879; cvta.to.local.u64 %rd5135, 
%rd10364; mov.u16 %rs965, 2; st.local.u8 [%rd5135+8], %rs965; mov.b32 %f1513, %r4632; mov.b32 %f1511, %r4630; mov.b32 %f1512, %r4631; mov.b32 %f1510, %r4629; ld.global.v2.f32 {%f5069, %f5070}, [%rd4641+40]; div.rn.f32 %f1516, %f1510, %f5069; div.rn.f32 %f1517, %f1512, %f5069; ld.global.u64 %rd5136, [%rd4641+16]; cvt.rn.f32.u64 %f5071, %rd5136; add.f32 %f5072, %f5071, 0fBF800000; rcp.rn.f32 %f1518, %f5072; setp.lt.f32 %p2800, %f1517, 0fBF000000; setp.gt.f32 %p2801, %f1516, 0f3F000000; or.pred %p2802, %p2801, %p2800; @%p2802 bra $L__BB1_2100; add.f32 %f5073, %f1516, 0f3F000000; div.rn.f32 %f5074, %f5073, %f1518; cvt.rmi.f32.f32 %f5075, %f5074; add.s64 %rd10881, %rd5136, -2; cvt.rn.f32.u64 %f5076, %rd10881; setp.gt.f32 %p2803, %f5075, 0f00000000; setp.lt.f32 %p2804, %f5075, %f5076; selp.f32 %f5077, %f5075, %f5076, %p2804; selp.f32 %f5078, %f5077, 0f00000000, %p2803; setp.gt.f32 %p2805, %f5078, 0f5F7FFFFF; max.f32 %f5079, %f5078, 0f00000000; cvt.rzi.u64.f32 %rd10882, %f5079; selp.b64 %rd5142, -1, %rd10882, %p2805; add.f32 %f5080, %f1517, 0f3F000000; div.rn.f32 %f5081, %f5080, %f1518; cvt.rpi.f32.f32 %f5082, %f5081; add.s64 %rd10883, %rd5136, -1; cvt.rn.f32.u64 %f5083, %rd10883; setp.gt.f32 %p2806, %f5082, 0f00000000; setp.lt.f32 %p2807, %f5082, %f5083; selp.f32 %f5084, %f5082, %f5083, %p2807; selp.f32 %f5085, %f5084, 0f00000000, %p2806; setp.gt.f32 %p2808, %f5085, 0f5F7FFFFF; max.f32 %f5086, %f5085, 0f00000000; cvt.rzi.u64.f32 %rd10884, %f5086; selp.b64 %rd5138, -1, %rd10884, %p2808; setp.ge.u64 %p2809, %rd5142, %rd5138; @%p2809 bra $L__BB1_2100; div.rn.f32 %f1519, %f1511, %f5070; div.rn.f32 %f1520, %f1513, %f5070; ld.global.u64 %rd5139, [%rd4641+32]; ld.global.u64 %rd5140, [%rd4641+24]; ld.global.u64 %rd5141, [%rd4641+8]; and.b32 %r4633, %r1591, 2147483647; mov.b32 %f1521, %r4633; and.b32 %r4634, %r1592, 2147483647; mov.b32 %f1522, %r4634; ld.local.v4.u32 {%r5432, %r5433, %r5434, %r4638}, [%rd5135]; mov.f32 %f5491, 0f7F7FFFFF; bra.uni $L__BB1_2071; $L__BB1_2106: 
ld.global.f32 %f1547, [%rd4641+256]; sub.f32 %f5125, %f1208, %f1547; ld.global.f32 %f1548, [%rd4641+260]; sub.f32 %f5126, %f1209, %f1548; ld.global.f32 %f1549, [%rd4641+252]; ld.global.f32 %f1550, [%rd4641+248]; mul.f32 %f5127, %f5126, %f1549; fma.rn.f32 %f1551, %f5125, %f1550, %f5127; mul.f32 %f5128, %f5125, %f1549; mul.f32 %f5129, %f5126, %f1550; sub.f32 %f1552, %f5129, %f5128; ld.global.u32 %rd10911, [%rd4641+8]; ld.global.u32 %rd10912, [%rd4641+12]; bfi.b64 %rd10913, %rd10912, %rd10911, 32, 32; cvt.u32.u64 %r4673, %rd10913; mov.b32 %f5130, %r4673; shr.u64 %rd10914, %rd10913, 32; cvt.u32.u64 %r4674, %rd10914; mov.b32 %f5131, %r4674; neg.f32 %f5132, %f5130; neg.f32 %f5133, %f5131; sub.f32 %f1553, %f5132, %f1551; sub.f32 %f1554, %f5133, %f1552; sub.f32 %f1555, %f1551, %f5130; sub.f32 %f1556, %f1552, %f5131; setp.ge.f32 %p2858, %f1553, 0f00000000; selp.f32 %f5134, %f1553, 0f00000000, %p2858; setp.ge.f32 %p2859, %f1554, 0f00000000; selp.f32 %f5135, %f1554, 0f00000000, %p2859; setp.ge.f32 %p2860, %f1555, 0f00000000; selp.f32 %f5136, %f1555, 0f00000000, %p2860; setp.ge.f32 %p2861, %f1556, 0f00000000; selp.f32 %f5137, %f1556, 0f00000000, %p2861; sub.f32 %f1557, %f5134, %f5136; mov.b32 %r4675, %f1557; sub.f32 %f1558, %f5135, %f5137; mov.b32 %r4676, %f1558; cvt.u64.u32 %rd10915, %r4676; cvt.u64.u32 %rd10916, %r4675; bfi.b64 %rd10917, %rd10915, %rd10916, 32, 32; st.local.u64 [%rd4605], %rd10917; mov.u64 %rd13062, 2; mov.u64 %rd13055, %rd4611; mov.u64 %rd13056, %rd4605; mov.u64 %rd13057, %rd4605; mov.u64 %rd13058, %rd10360; mov.u64 %rd13059, %rd4605; mov.u64 %rd13060, %rd4605; mov.u64 %rd13061, %rd10360; $L__BB1_2107: setp.eq.s64 %p2862, %rd13062, 0; @%p2862 bra $L__BB1_2110; add.s64 %rd13062, %rd13062, -1; add.s64 %rd10918, %rd13059, 8; setp.eq.s64 %p2863, %rd13059, %rd13055; selp.b64 %rd13055, %rd10918, %rd13055, %p2863; add.s64 %rd10919, %rd13056, 8; selp.b64 %rd13056, %rd10919, %rd13056, %p2863; add.s64 %rd10920, %rd13057, 8; selp.b64 %rd13057, %rd10920, %rd13057, 
%p2863; add.s64 %rd10921, %rd13058, 8; selp.b64 %rd13058, %rd10921, %rd13058, %p2863; selp.b64 %rd10922, %rd10919, %rd13059, %p2863; selp.b64 %rd10923, %rd10920, %rd13060, %p2863; selp.b64 %rd10924, %rd10921, %rd13061, %p2863; setp.eq.s64 %p2864, %rd13062, 0; add.s64 %rd10925, %rd10922, 4; add.s64 %rd10926, %rd10923, 4; add.s64 %rd10927, %rd10924, 4; selp.b64 %rd13059, %rd10922, %rd10925, %p2864; selp.b64 %rd13060, %rd10923, %rd10926, %p2864; selp.b64 %rd13061, %rd10924, %rd10927, %p2864; ld.local.f32 %f5138, [%rd10923]; setp.eq.f32 %p2865, %f5138, 0f00000000; @%p2865 bra $L__BB1_2107; add.f32 %f5139, %f1551, %f1557; mov.b32 %r4677, %f5139; add.f32 %f5140, %f1552, %f1558; mov.b32 %r4678, %f5140; cvt.u64.u32 %rd10930, %r4678; cvt.u64.u32 %rd10931, %r4677; bfi.b64 %rd13066, %rd10930, %rd10931, 32, 32; mov.u64 %rd13065, 0; bra.uni $L__BB1_2123; $L__BB1_2110: setp.lt.f32 %p2866, %f1553, %f1555; mov.f32 %f5492, 0fFF7FFFFF; @%p2866 bra $L__BB1_2113; bra.uni $L__BB1_2111; $L__BB1_2113: setp.leu.f32 %p2871, %f1555, 0fFF7FFFFF; mov.pred %p2963, 0; @%p2871 bra $L__BB1_2115; mov.f32 %f5492, %f1555; bra.uni $L__BB1_2115; $L__BB1_2111: setp.leu.f32 %p2868, %f1553, 0fFF7FFFFF; mov.pred %p2963, 0; @%p2868 bra $L__BB1_2115; mov.pred %p2963, -1; mov.f32 %f5492, %f1553; $L__BB1_2115: setp.lt.f32 %p2873, %f1554, %f1556; @%p2873 bra $L__BB1_2118; bra.uni $L__BB1_2116; $L__BB1_2118: setp.gt.f32 %p2875, %f1556, %f5492; @%p2875 bra $L__BB1_2121; bra.uni $L__BB1_2119; $L__BB1_2121: mov.u64 %rd10934, 0; st.local.u64 [%rd4609], %rd10934; neg.f32 %f5494, %f1556; mov.u64 %rd13064, %rd4630; bra.uni $L__BB1_2122; $L__BB1_2116: setp.leu.f32 %p2874, %f1554, %f5492; @%p2874 bra $L__BB1_2119; mov.u64 %rd10932, 0; st.local.u64 [%rd4609], %rd10932; mov.u64 %rd13064, %rd4630; mov.f32 %f5492, %f1554; bra.uni $L__BB1_2120; $L__BB1_2119: mov.u64 %rd10933, 0; st.local.u64 [%rd4609], %rd10933; neg.f32 %f5494, %f5492; not.pred %p2876, %p2963; mov.u64 %rd13064, %rd4609; @%p2876 bra $L__BB1_2122; 
$L__BB1_2120: mov.f32 %f5494, %f5492; $L__BB1_2122: st.local.f32 [%rd13064], %f5494; ld.local.u64 %rd10937, [%rd4609]; cvt.u32.u64 %r4679, %rd10937; mov.b32 %f5143, %r4679; shr.u64 %rd10938, %rd10937, 32; cvt.u32.u64 %r4680, %rd10938; mov.b32 %f5144, %r4680; add.f32 %f5145, %f1551, %f5143; add.f32 %f5146, %f1552, %f5144; mov.b32 %r4681, %f5145; mov.b32 %r4682, %f5146; cvt.u64.u32 %rd10939, %r4682; cvt.u64.u32 %rd10940, %r4681; bfi.b64 %rd13066, %rd10939, %rd10940, 32, 32; mov.u64 %rd13065, 1; $L__BB1_2123: mov.u64 %rd11311, 0; cvt.u32.u64 %r4683, %rd13066; mov.b32 %f5147, %r4683; shr.u64 %rd10941, %rd13066, 32; cvt.u32.u64 %r4684, %rd10941; mov.b32 %f5148, %r4684; mul.f32 %f5149, %f1550, %f5147; mul.f32 %f5150, %f1549, %f5148; sub.f32 %f5151, %f5149, %f5150; mul.f32 %f5152, %f1550, %f5148; fma.rn.f32 %f5153, %f1549, %f5147, %f5152; add.f32 %f5154, %f1547, %f5151; mov.b32 %r4685, %f5154; add.f32 %f5155, %f1548, %f5153; mov.b32 %r4686, %f5155; cvt.u64.u32 %rd10942, %r4686; cvt.u64.u32 %rd10943, %r4685; bfi.b64 %rd10944, %rd10942, %rd10943, 32, 32; or.b64 %rd10945, %rd11311, %rd10944; mov.b64 {%r5435, %r5436}, %rd10945; mov.b64 {%r5437, %r4687}, %rd13065; bra.uni $L__BB1_2124; $L__BB1_2088: sub.f32 %f5099, %f5489, %f1504; abs.f32 %f1540, %f5099; setp.le.f32 %p2828, %f1540, 0f34000000; @%p2828 bra $L__BB1_2090; abs.f32 %f5100, %f5489; abs.f32 %f5101, %f1504; setp.gt.f32 %p2830, %f5101, %f5100; selp.f32 %f5102, %f5101, %f5100, %p2830; mul.f32 %f5103, %f5102, 0f34000000; setp.gtu.f32 %p2831, %f1540, %f5103; @%p2831 bra $L__BB1_2094; bra.uni $L__BB1_2090; $L__BB1_2071: setp.gt.u64 %p2810, %rd5139, %rd5142; @%p2810 bra $L__BB1_2073; bra.uni $L__BB1_2072; $L__BB1_2073: add.s64 %rd10885, %rd5140, %rd5142; ld.u8 %rs966, [%rd10885]; setp.eq.s16 %p2811, %rs966, 0; @%p2811 bra $L__BB1_2098; cvt.rn.f32.u64 %f5088, %rd5142; fma.rn.f32 %f1524, %f1518, %f5088, 0fBF000000; setp.gt.u64 %p2812, %rd5136, %rd5142; @%p2812 bra $L__BB1_2076; bra.uni $L__BB1_2075; $L__BB1_2076: shl.b64 
%rd10886, %rd5142, 2; add.s64 %rd5143, %rd5141, %rd10886; ld.f32 %f1525, [%rd5143]; add.s64 %rd10887, %rd5142, 1; setp.gt.u64 %p2813, %rd5136, %rd10887; @%p2813 bra $L__BB1_2078; bra.uni $L__BB1_2077; $L__BB1_2078: ld.f32 %f1526, [%rd5143+4]; setp.gt.f32 %p2814, %f1526, %f1520; setp.gt.f32 %p2815, %f1525, %f1520; and.pred %p2816, %p2815, %p2814; @%p2816 bra $L__BB1_2098; setp.lt.f32 %p2817, %f1525, %f1519; setp.lt.f32 %p2818, %f1526, %f1519; and.pred %p2819, %p2817, %p2818; @%p2819 bra $L__BB1_2098; mul.f32 %f5089, %f5069, %f1524; mov.b32 %r4639, %f5089; mul.f32 %f1529, %f5070, %f1525; mov.b32 %r4640, %f1529; cvt.u64.u32 %rd10888, %r4640; cvt.u64.u32 %rd10889, %r4639; add.f32 %f5090, %f1518, %f1524; mul.f32 %f1527, %f5069, %f5090; mov.b32 %r1599, %f1527; mul.f32 %f5091, %f5070, %f1526; mov.b32 %r4641, %f5091; cvt.u64.u32 %rd10890, %r4641; cvt.u64.u32 %rd10891, %r1599; bfi.b64 %rd10892, %rd10890, %rd10891, 32, 32; bfi.b64 %rd10893, %rd10888, %rd10889, 32, 32; cvt.u32.u64 %r5430, %rd10893; mov.b32 %f5489, %r5430; sub.f32 %f1530, %f1527, %f5489; sub.f32 %f1531, %f5091, %f1529; sub.f32 %f5092, %f1504, %f5489; sub.f32 %f5093, %f1505, %f1529; mul.f32 %f5094, %f1531, %f5093; fma.rn.f32 %f1532, %f1530, %f5092, %f5094; mul.f32 %f5095, %f1531, %f1531; fma.rn.f32 %f5096, %f1530, %f1530, %f5095; add.f32 %f1533, %f5096, 0f00000000; setp.gtu.f32 %p2820, %f1532, 0f00000000; mov.b64 {%r4642, %r5431}, %rd10893; mov.b64 {%r4643, %r1602}, %rd10892; @%p2820 bra $L__BB1_2082; bra.uni $L__BB1_2081; $L__BB1_2082: setp.ltu.f32 %p2821, %f1532, %f1533; @%p2821 bra $L__BB1_2084; bra.uni $L__BB1_2083; $L__BB1_2084: setp.eq.f32 %p2822, %f1533, 0f00000000; @%p2822 bra $L__BB1_2097; div.rn.f32 %f5097, %f1532, %f1533; fma.rn.f32 %f5489, %f1530, %f5097, %f5489; mov.b32 %r5430, %f5489; fma.rn.f32 %f5490, %f1531, %f5097, %f1529; mov.b32 %r5431, %f5490; bra.uni $L__BB1_2086; $L__BB1_2081: mov.b32 %f5490, %r5431; bra.uni $L__BB1_2086; $L__BB1_2083: mov.b32 %f5490, %r1602; mov.f32 %f5489, %f1527; 
mov.u32 %r5430, %r1599; mov.u32 %r5431, %r1602; $L__BB1_2086: setp.eq.f32 %p2823, %f1504, %f5489; @%p2823 bra $L__BB1_2090; bra.uni $L__BB1_2087; $L__BB1_2090: setp.eq.f32 %p2833, %f5490, %f1505; mov.pred %p2832, -1; mov.pred %p2961, %p2832; @%p2833 bra $L__BB1_2094; setp.eq.f32 %p2835, %f1522, 0f7F800000; and.b32 %r4645, %r5431, 2147483647; mov.b32 %f5104, %r4645; setp.eq.f32 %p2836, %f5104, 0f7F800000; or.pred %p2837, %p2835, %p2836; mov.pred %p2961, 0; @%p2837 bra $L__BB1_2094; sub.f32 %f5105, %f5490, %f1505; abs.f32 %f1541, %f5105; setp.le.f32 %p2839, %f1541, 0f34000000; mov.pred %p2961, %p2832; @%p2839 bra $L__BB1_2094; abs.f32 %f5106, %f5490; abs.f32 %f5107, %f1505; setp.gt.f32 %p2840, %f5107, %f5106; selp.f32 %f5108, %f5107, %f5106, %p2840; mul.f32 %f5109, %f5108, 0f34000000; setp.le.f32 %p2961, %f1541, %f5109; bra.uni $L__BB1_2094; $L__BB1_2087: setp.eq.f32 %p2825, %f1521, 0f7F800000; and.b32 %r4644, %r5430, 2147483647; mov.b32 %f5098, %r4644; setp.eq.f32 %p2826, %f5098, 0f7F800000; or.pred %p2827, %p2825, %p2826; mov.pred %p2961, 0; @%p2827 bra $L__BB1_2094; bra.uni $L__BB1_2088; $L__BB1_2094: cvt.u64.u32 %rd10894, %r5431; cvt.u64.u32 %rd10895, %r5430; bfi.b64 %rd5144, %rd10894, %rd10895, 32, 32; mov.b64 {%r4646, %r4647}, %rd5144; selp.u64 %rd5145, 1, 0, %p2961; mov.b32 %f5110, %r4646; sub.f32 %f5111, %f5110, %f1504; mov.b32 %f5112, %r4647; sub.f32 %f5113, %f5112, %f1505; mul.f32 %f5114, %f5113, %f5113; fma.rn.f32 %f5115, %f5111, %f5111, %f5114; add.f32 %f1542, %f5115, 0f00000000; setp.geu.f32 %p2841, %f1542, %f5491; @%p2841 bra $L__BB1_2098; sqrt.rn.f32 %f5116, %f1542; setp.gtu.f32 %p2842, %f5116, %f6; mov.f32 %f5491, %f1542; @%p2842 bra $L__BB1_2098; mov.b64 {%r5434, %r4648}, %rd5145; mov.u32 %r5432, %r4646; mov.u32 %r5433, %r4647; mov.f32 %f5491, %f1542; $L__BB1_2098: add.s64 %rd5142, %rd5142, 1; setp.lt.u64 %p2843, %rd5142, %rd5138; @%p2843 bra $L__BB1_2071; st.local.u32 [%rd5135+8], %r5434; mov.b64 %rd10896, {%r5432, %r5433}; st.local.u64 [%rd5135], 
%rd10896; $L__BB1_2100: cvt.u64.u32 %rd10897, %r1591; cvt.u64.u32 %rd10898, %r1592; bfi.b64 %rd5147, %rd10898, %rd10897, 32, 32; ld.local.v4.u32 {%r4652, %r4653, %r4654, %r4655}, [%rd5135]; mov.b64 %rd5149, {%r4654, %r4655}; mov.b64 %rd5148, {%r4652, %r4653}; mov.b32 {%rs967, %rs968}, %r4654; and.b16 %rs969, %rs967, 255; setp.eq.s16 %p2844, %rs969, 2; cvt.u64.u16 %rd10899, %rs967; and.b64 %rd10900, %rd10899, 255; selp.b64 %rd10901, 2, %rd10900, %p2844; and.b64 %rd10902, %rd5149, 4294967040; or.b64 %rd10903, %rd10902, %rd10901; mov.b64 {%r4660, %r4661}, %rd10903; mov.b32 {%rs1042, %rs970}, %r4660; and.b16 %rs971, %rs1042, 255; setp.eq.s16 %p2845, %rs971, 2; mov.u32 %r5437, 2; mov.u32 %r5435, 0; mov.u32 %r5436, %r5435; @%p2845 bra $L__BB1_2124; ld.global.u8 %rs972, [%rd4641+64]; setp.eq.s16 %p2846, %rs972, 0; shr.u64 %rd10904, %rd5148, 32; cvt.u32.u64 %r4662, %rd10904; mov.b32 %f1544, %r4662; @%p2846 bra $L__BB1_2105; mov.b64 {%r4663, %r4664}, %rd5147; mov.b32 %f1546, %r4664; mov.b32 %f1545, %r4663; ld.global.u8 %rs205, [%rd4641+65]; setp.gt.f32 %p2848, %f1545, %f5061; setp.lt.f32 %p2849, %f1545, %f5063; or.pred %p2850, %p2849, %p2848; mov.pred %p2962, 0; @%p2850 bra $L__BB1_2104; setp.geu.f32 %p2851, %f1546, 0fFF7FFFFF; setp.leu.f32 %p2852, %f1546, 0f7F7FFFFF; and.pred %p2962, %p2852, %p2851; $L__BB1_2104: setp.ge.f32 %p2853, %f1505, %f1544; setp.le.f32 %p2854, %f1505, %f1544; setp.eq.s16 %p2855, %rs205, 0; selp.u32 %r4665, -1, 0, %p2853; selp.u32 %r4666, -1, 0, %p2854; selp.b32 %r4667, %r4666, %r4665, %p2855; and.b32 %r4668, %r4667, 1; setp.eq.b32 %p2856, %r4668, 1; and.pred %p2857, %p2856, %p2962; selp.u16 %rs1042, 1, 0, %p2857; $L__BB1_2105: cvt.u32.u64 %r4669, %rd5148; mov.b32 %f5117, %r4669; mul.f32 %f5118, %f1503, %f5117; mul.f32 %f5119, %f1502, %f1544; sub.f32 %f5120, %f5118, %f5119; mul.f32 %f5121, %f1503, %f1544; fma.rn.f32 %f5122, %f1502, %f5117, %f5121; add.f32 %f5123, %f1500, %f5120; mov.b32 %r4670, %f5123; add.f32 %f5124, %f1501, %f5122; mov.b32 %r4671, 
%f5124; cvt.u64.u32 %rd10905, %r4671; cvt.u64.u32 %rd10906, %r4670; cvt.u64.u16 %rd10907, %rs1042; bfi.b64 %rd10908, %rd10905, %rd10906, 32, 32; and.b64 %rd10909, %rd10907, 255; mov.b64 {%r5435, %r5436}, %rd10908; mov.b64 {%r5437, %r4672}, %rd10909; bra.uni $L__BB1_2124; $L__BB1_1923: setp.eq.s32 %p2614, %r5372, 0; @%p2614 bra $L__BB1_1936; setp.ne.s32 %p2615, %r5372, 1; @%p2615 bra $L__BB1_1949; add.s64 %rd4671, %rd12887, 1; or.b64 %rd10405, %rd4671, %rd4646; and.b64 %rd10406, %rd10405, -4294967296; setp.eq.s64 %p2616, %rd10406, 0; @%p2616 bra $L__BB1_1927; rem.u64 %rd12893, %rd4671, %rd4646; bra.uni $L__BB1_1928; $L__BB1_1936: setp.eq.s64 %p2623, %rd12887, 0; selp.b64 %rd4718, %rd4646, %rd12887, %p2623; add.s64 %rd10445, %rd4718, -1; setp.gt.u64 %p2624, %rd4646, %rd10445; @%p2624 bra $L__BB1_1938; bra.uni $L__BB1_1937; $L__BB1_1938: shl.b64 %rd10446, %rd4718, 3; add.s64 %rd10447, %rd4647, %rd10446; ld.u32 %rd10448, [%rd10447+-8]; ld.u32 %rd10449, [%rd10447+-4]; bfi.b64 %rd4719, %rd10449, %rd10448, 32, 32; or.b64 %rd10450, %rd4718, %rd4646; and.b64 %rd10451, %rd10450, -4294967296; setp.eq.s64 %p2625, %rd10451, 0; @%p2625 bra $L__BB1_1940; rem.u64 %rd12910, %rd4718, %rd4646; bra.uni $L__BB1_1941; $L__BB1_2054: ld.u32 %r4602, [%rd4784+76]; cvt.u64.u32 %rd10815, %r4602; setp.le.u64 %p2790, %rd4775, %rd10815; mul.wide.u32 %rd10816, %r4602, 12; add.s64 %rd10817, %rd4776, %rd10816; setp.eq.s64 %p2791, %rd10817, 0; or.pred %p2792, %p2790, %p2791; selp.b32 %r1485, %r1485, %r5391, %p2792; selp.b32 %r1484, %r1484, %r5390, %p2792; selp.b32 %r1483, %r1483, %r5389, %p2792; selp.b32 %r1487, %r1487, %r5404, %p2792; selp.b32 %r1488, %r1488, %r1537, %p2792; $L__BB1_1959: mov.b32 %f1435, %r1488; $L__BB1_1960: mov.u32 %r1489, %r1490; setp.eq.s32 %p2634, %r1489, 0; @%p2634 bra $L__BB1_2061; cvt.u64.u32 %rd10507, %r1489; add.s64 %rd10508, %rd10507, -1; cvt.u32.u64 %r1490, %rd10508; st.local.u32 [%rd4609+512], %r1490; mul.wide.u32 %rd10509, %r1489, 8; add.s64 %rd10510, %rd4609, 
%rd10509; ld.local.u32 %rd4782, [%rd10510+-4]; ld.local.u32 %rd10511, [%rd10510+-8]; shl.b64 %rd10512, %rd10511, 32; or.b64 %rd4781, %rd10512, 1; mov.b64 {%r4458, %r4459}, %rd4782; mov.b32 %f4865, %r4458; neg.f32 %f4866, %f4865; setp.le.f32 %p2635, %f1435, %f4866; @%p2635 bra $L__BB1_1960; mov.b64 {%r4460, %r4461}, %rd4781; cvt.u64.u32 %rd4783, %r4461; setp.gt.u64 %p2636, %rd4772, %rd4783; @%p2636 bra $L__BB1_1964; bra.uni $L__BB1_1963; $L__BB1_1964: mul.lo.s64 %rd10513, %rd4783, 96; add.s64 %rd4784, %rd4774, %rd10513; ld.u8 %rs907, [%rd4784+88]; and.b16 %rs908, %rs907, 1; setp.eq.b16 %p2638, %rs908, 1; mov.pred %p2960, 0; xor.pred %p2639, %p2638, %p2960; not.pred %p2640, %p2639; @%p2640 bra $L__BB1_1966; ld.v4.u32 {%r4462, %r4463, %r4464, %r4465}, [%rd4784+64]; cvt.u64.u32 %rd10514, %r4462; setp.gt.u64 %p2642, %rd4775, %rd10514; mul.wide.u32 %rd10515, %r4462, 12; add.s64 %rd10516, %rd4776, %rd10515; selp.b64 %rd10517, %rd10516, 0, %p2642; setp.eq.s64 %p2643, %rd10517, 0; add.s64 %rd10518, %rd10517, 8; selp.b64 %rd12931, 0, %rd10518, %p2643; cvt.u64.u32 %rd10519, %r4463; setp.gt.u64 %p2644, %rd4775, %rd10519; mul.wide.u32 %rd10520, %r4463, 12; add.s64 %rd10521, %rd4776, %rd10520; selp.b64 %rd10522, %rd10521, 0, %p2644; setp.eq.s64 %p2645, %rd10522, 0; add.s64 %rd10523, %rd10522, 8; selp.b64 %rd12930, 0, %rd10523, %p2645; ld.u32 %r4469, [%rd4784+72]; cvt.u64.u32 %rd10524, %r4469; setp.gt.u64 %p2646, %rd4775, %rd10524; mul.wide.u32 %rd10525, %r4469, 12; add.s64 %rd10526, %rd4776, %rd10525; selp.b64 %rd10527, %rd10526, 0, %p2646; setp.eq.s64 %p2647, %rd10527, 0; add.s64 %rd10528, %rd10527, 8; selp.b64 %rd12929, 0, %rd10528, %p2647; cvt.u64.u32 %rd10529, %r4465; setp.gt.u64 %p2648, %rd4775, %rd10529; mul.wide.u32 %rd10530, %r4465, 12; add.s64 %rd10531, %rd4776, %rd10530; selp.b64 %rd10532, %rd10531, 0, %p2648; setp.eq.s64 %p2649, %rd10532, 0; add.s64 %rd10533, %rd10532, 8; selp.b64 %rd12928, 0, %rd10533, %p2649; mov.pred %p2960, -1; $L__BB1_1966: ld.v4.f32 {%f4867, 
%f4868, %f4869, %f4870}, [%rd4784]; sub.f32 %f4875, %f4867, %f1433; sub.f32 %f4876, %f4868, %f1433; sub.f32 %f4877, %f4869, %f1433; sub.f32 %f4878, %f4870, %f1433; ld.v4.f32 {%f4879, %f4880, %f4881, %f4882}, [%rd4784+16]; sub.f32 %f4887, %f4879, %f1434; sub.f32 %f4888, %f4880, %f1434; sub.f32 %f4889, %f4881, %f1434; sub.f32 %f4890, %f4882, %f1434; ld.v4.f32 {%f4891, %f4892, %f4893, %f4894}, [%rd4784+32]; sub.f32 %f4899, %f1433, %f4891; sub.f32 %f4900, %f1433, %f4892; sub.f32 %f4901, %f1433, %f4893; sub.f32 %f4902, %f1433, %f4894; ld.v4.f32 {%f4903, %f4904, %f4905, %f4906}, [%rd4784+48]; sub.f32 %f4911, %f1434, %f4903; sub.f32 %f4912, %f1434, %f4904; sub.f32 %f4913, %f1434, %f4905; sub.f32 %f4914, %f1434, %f4906; setp.ge.f32 %p2650, %f4875, %f4899; selp.f32 %f4915, %f4875, %f4899, %p2650; setp.ge.f32 %p2651, %f4876, %f4900; selp.f32 %f4916, %f4876, %f4900, %p2651; setp.ge.f32 %p2652, %f4877, %f4901; selp.f32 %f4917, %f4877, %f4901, %p2652; setp.ge.f32 %p2653, %f4878, %f4902; selp.f32 %f4918, %f4878, %f4902, %p2653; setp.ge.f32 %p2654, %f4887, %f4911; selp.f32 %f4919, %f4887, %f4911, %p2654; setp.ge.f32 %p2655, %f4888, %f4912; selp.f32 %f4920, %f4888, %f4912, %p2655; setp.ge.f32 %p2656, %f4889, %f4913; selp.f32 %f4921, %f4889, %f4913, %p2656; setp.ge.f32 %p2657, %f4890, %f4914; selp.f32 %f4922, %f4890, %f4914, %p2657; setp.ge.f32 %p2658, %f4915, 0f00000000; selp.f32 %f4923, %f4915, 0f00000000, %p2658; setp.ge.f32 %p2659, %f4916, 0f00000000; selp.f32 %f4924, %f4916, 0f00000000, %p2659; setp.ge.f32 %p2660, %f4917, 0f00000000; selp.f32 %f4925, %f4917, 0f00000000, %p2660; setp.ge.f32 %p2661, %f4918, 0f00000000; selp.f32 %f4926, %f4918, 0f00000000, %p2661; mov.b32 %r4470, %f4923; mov.b32 %r4471, %f4924; mov.b32 %r4472, %f4925; mov.b32 %r4473, %f4926; cvt.u64.u32 %rd10534, %r4473; cvt.u64.u32 %rd10535, %r4471; cvt.u64.u32 %rd10536, %r4470; cvt.u64.u32 %rd10537, %r4472; bfi.b64 %rd10538, %rd10534, %rd10537, 32, 32; bfi.b64 %rd10539, %rd10535, %rd10536, 32, 32; setp.ge.f32 
%p2662, %f4919, 0f00000000; selp.f32 %f4927, %f4919, 0f00000000, %p2662; setp.ge.f32 %p2663, %f4920, 0f00000000; selp.f32 %f4928, %f4920, 0f00000000, %p2663; setp.ge.f32 %p2664, %f4921, 0f00000000; selp.f32 %f4929, %f4921, 0f00000000, %p2664; setp.ge.f32 %p2665, %f4922, 0f00000000; selp.f32 %f4930, %f4922, 0f00000000, %p2665; mov.b32 %r4474, %f4927; mov.b32 %r4475, %f4928; mov.b32 %r4476, %f4929; mov.b32 %r4477, %f4930; cvt.u64.u32 %rd10540, %r4477; cvt.u64.u32 %rd10541, %r4475; cvt.u64.u32 %rd10542, %r4474; cvt.u64.u32 %rd10543, %r4476; bfi.b64 %rd10544, %rd10540, %rd10543, 32, 32; bfi.b64 %rd10545, %rd10541, %rd10542, 32, 32; mov.b64 {%r4478, %r4479}, %rd10539; mov.b64 {%r4480, %r4481}, %rd10538; cvt.u64.u32 %rd10546, %r4481; cvt.u64.u32 %rd10547, %r4479; cvt.u64.u32 %rd10548, %r4480; bfi.b64 %rd10549, %rd10546, %rd10548, 32, 32; mov.b64 {%r4482, %r4483}, %rd10549; bfi.b64 %rd10550, %rd10547, %rd10536, 32, 32; mov.b64 {%r4484, %r4485}, %rd10550; mov.b32 %f4931, %r4484; mov.b32 %f4932, %r4485; mov.b32 %f4933, %r4482; mov.b32 %f4934, %r4483; mov.b32 %f4935, %r4478; mov.b32 %f4936, %r4479; mov.b32 %f4937, %r4480; mov.b32 %f4938, %r4481; mov.b64 {%r4486, %r4487}, %rd10545; mov.b64 {%r4488, %r4489}, %rd10544; cvt.u64.u32 %rd10551, %r4489; cvt.u64.u32 %rd10552, %r4487; cvt.u64.u32 %rd10553, %r4488; bfi.b64 %rd10554, %rd10551, %rd10553, 32, 32; mov.b64 {%r4490, %r4491}, %rd10554; bfi.b64 %rd10555, %rd10552, %rd10542, 32, 32; mov.b64 {%r4492, %r4493}, %rd10555; mov.b32 %f4939, %r4492; mov.b32 %f4940, %r4493; mov.b32 %f4941, %r4490; mov.b32 %f4942, %r4491; mov.b32 %f4943, %r4486; mov.b32 %f4944, %r4487; mov.b32 %f4945, %r4488; mov.b32 %f4946, %r4489; mul.f32 %f4947, %f4943, %f4939; mul.f32 %f4948, %f4944, %f4940; mul.f32 %f4949, %f4945, %f4941; mul.f32 %f4950, %f4946, %f4942; fma.rn.f32 %f4951, %f4935, %f4931, %f4947; fma.rn.f32 %f4952, %f4936, %f4932, %f4948; fma.rn.f32 %f4953, %f4937, %f4933, %f4949; fma.rn.f32 %f4954, %f4938, %f4934, %f4950; add.f32 %f4955, %f4951, 
0f00000000; add.f32 %f4956, %f4952, 0f00000000; add.f32 %f4957, %f4953, 0f00000000; add.f32 %f4958, %f4954, 0f00000000; sqrt.rn.f32 %f4959, %f4955; sqrt.rn.f32 %f4960, %f4956; sqrt.rn.f32 %f4961, %f4957; sqrt.rn.f32 %f4962, %f4958; mov.b32 %r4494, %f4959; mov.b32 %r4495, %f4960; mov.b32 %r4496, %f4961; mov.b32 %r4497, %f4962; cvt.u64.u32 %rd10556, %r4497; cvt.u64.u32 %rd10557, %r4495; cvt.u64.u32 %rd10558, %r4494; cvt.u64.u32 %rd10559, %r4496; bfi.b64 %rd13037, %rd10556, %rd10559, 32, 32; mov.b64 {%r4498, %r4499}, %rd13037; bfi.b64 %rd13036, %rd10557, %rd10558, 32, 32; mov.b64 {%r4500, %r4501}, %rd13036; mov.b32 %f4963, %r4500; mov.b32 %f4964, %r4501; mov.b32 %f4965, %r4498; mov.b32 %f4966, %r4499; setp.lt.f32 %p2666, %f4963, %f1435; setp.lt.f32 %p2667, %f4964, %f1435; setp.lt.f32 %p2668, %f4965, %f1435; setp.lt.f32 %p2669, %f4966, %f1435; selp.u32 %r4502, 1, 0, %p2666; selp.u32 %r4503, -1, 0, %p2667; bfi.b32 %r4504, %r4503, %r4502, 8, 1; selp.u32 %r4505, -1, 0, %p2668; bfi.b32 %r4506, %r4505, %r4504, 16, 1; selp.u32 %r4507, -1, 0, %p2669; bfi.b32 %r4508, %r4507, %r4506, 24, 1; cvt.u64.u32 %rd10560, %r4508; mov.b64 {%r4509, %r4510}, %rd10560; mov.b32 {%rs909, %rs910}, %r4509; and.b16 %rs911, %rs909, 1; shr.u16 %rs912, %rs909, 7; and.b16 %rs913, %rs912, 2; or.b16 %rs914, %rs913, %rs911; shl.b16 %rs915, %rs910, 2; and.b16 %rs916, %rs915, 4; or.b16 %rs917, %rs914, %rs916; shr.u16 %rs918, %rs910, 5; and.b16 %rs919, %rs918, 8; or.b16 %rs920, %rs917, %rs919; cvt.u64.u16 %rd4795, %rs920; @%p2960 bra $L__BB1_1968; bra.uni $L__BB1_1967; $L__BB1_1968: mov.u64 %rd10561, 1; st.local.v2.u64 [%rd8], {%rd12931, %rd12930}; st.local.v2.u64 [%rd8+16], {%rd12929, %rd12928}; mov.f32 %f4967, 0f00000000; st.local.v4.f32 [%rd24], {%f4967, %f4967, %f4967, %f4967}; mov.u32 %r4521, 4; st.local.u32 [%rd4605+16], %r4521; st.local.u32 [%rd4605+52], %r4521; st.local.u32 [%rd4605+88], %r4521; st.local.u32 [%rd4605+124], %r4521; mov.u64 %rd4800, %rd10561; $L__BB1_1969: add.s64 %rd10562, %rd4800, 
-1; cvt.u32.u64 %r4522, %rd10562; shl.b64 %rd10564, %rd10561, %r4522; and.b64 %rd10565, %rd10564, %rd4795; setp.eq.s64 %p2670, %rd10565, 0; @%p2670 bra $L__BB1_2022; shl.b64 %rd10566, %rd4800, 3; add.s64 %rd10567, %rd8, %rd10566; ld.local.u64 %rd4801, [%rd10567+-8]; setp.eq.s64 %p2671, %rd4801, 0; @%p2671 bra $L__BB1_2022; ld.u32 %r1491, [%rd4801]; cvt.u64.u32 %rd4802, %r1491; ld.global.u64 %rd10568, [%rd4641+112]; setp.gt.u64 %p2672, %rd10568, %rd4802; @%p2672 bra $L__BB1_1973; bra.uni $L__BB1_1972; $L__BB1_1973: ld.global.u64 %rd10569, [%rd4641+104]; mul.lo.s64 %rd10570, %rd4802, 12; add.s64 %rd4803, %rd10569, %rd10570; ld.u32 %rd4804, [%rd4803+8]; ld.u32 %rd4805, [%rd4803]; ld.global.u64 %rd4806, [%rd4641+96]; setp.gt.u64 %p2673, %rd4806, %rd4805; @%p2673 bra $L__BB1_1975; bra.uni $L__BB1_1974; $L__BB1_1975: ld.global.u64 %rd4807, [%rd4641+88]; shl.b64 %rd10571, %rd4805, 3; add.s64 %rd10572, %rd4807, %rd10571; ld.u32 %rd10573, [%rd10572]; ld.u32 %rd10574, [%rd10572+4]; bfi.b64 %rd4808, %rd10574, %rd10573, 32, 32; ld.u32 %rd4809, [%rd4803+4]; setp.gt.u64 %p2674, %rd4806, %rd4809; @%p2674 bra $L__BB1_1977; bra.uni $L__BB1_1976; $L__BB1_1977: setp.gt.u64 %p2675, %rd4806, %rd4804; @%p2675 bra $L__BB1_1979; bra.uni $L__BB1_1978; $L__BB1_1979: shl.b64 %rd10575, %rd4809, 3; add.s64 %rd10576, %rd4807, %rd10575; shl.b64 %rd10577, %rd4804, 3; add.s64 %rd10578, %rd4807, %rd10577; cvt.u32.u64 %r4523, %rd4808; mov.b32 %f1436, %r4523; shr.u64 %rd10579, %rd4808, 32; cvt.u32.u64 %r4524, %rd10579; mov.b32 %f1437, %r4524; ld.u32 %rd10580, [%rd10576]; ld.u32 %rd10581, [%rd10576+4]; bfi.b64 %rd4810, %rd10581, %rd10580, 32, 32; cvt.u32.u64 %r4525, %rd4810; shr.u64 %rd10582, %rd4810, 32; cvt.u32.u64 %r4526, %rd10582; mov.b32 %f1438, %r4525; sub.f32 %f1439, %f1438, %f1436; mov.b32 %f5483, %r4526; sub.f32 %f1441, %f5483, %f1437; ld.u32 %rd10583, [%rd10578]; ld.u32 %rd10584, [%rd10578+4]; bfi.b64 %rd4811, %rd10584, %rd10583, 32, 32; cvt.u32.u64 %r4527, %rd4811; shr.u64 %rd10585, 
%rd4811, 32; cvt.u32.u64 %r4528, %rd10585; mov.b32 %f1442, %r4527; sub.f32 %f1443, %f1442, %f1436; mov.b32 %f1444, %r4528; sub.f32 %f1445, %f1444, %f1437; sub.f32 %f1446, %f1433, %f1436; sub.f32 %f1447, %f1434, %f1437; mul.f32 %f4968, %f1447, %f1441; fma.rn.f32 %f1448, %f1446, %f1439, %f4968; mul.f32 %f4969, %f1447, %f1445; fma.rn.f32 %f1449, %f1446, %f1443, %f4969; setp.le.f32 %p2676, %f1448, 0f00000000; setp.le.f32 %p2677, %f1449, 0f00000000; and.pred %p2678, %p2676, %p2677; @%p2678 bra $L__BB1_2017; bra.uni $L__BB1_1980; $L__BB1_2017: add.u64 %rd13022, %SP, 552; add.u64 %rd13028, %SP, 0; st.local.u64 [%rd4613], %rd4808; mov.u64 %rd13033, 2; mov.u64 %rd13019, %rd4626; mov.u64 %rd13020, %rd4625; mov.u64 %rd13021, %rd4625; mov.u64 %rd13023, %rd4625; mov.u64 %rd13024, %rd4625; mov.u64 %rd13025, %rd13022; mov.u64 %rd13026, %rd4613; mov.u64 %rd13027, %rd4613; mov.u64 %rd13029, %rd4613; mov.u64 %rd13030, %rd4613; mov.u64 %rd13031, %rd13028; mov.u64 %rd13032, %rd4614; $L__BB1_2018: setp.eq.s64 %p2731, %rd13033, 0; mov.u64 %rd13034, 1; @%p2731 bra $L__BB1_2020; add.s64 %rd13033, %rd13033, -1; add.s64 %rd10730, %rd13020, 8; setp.eq.s64 %p2732, %rd13023, %rd13019; selp.b64 %rd10731, %rd10730, %rd13023, %p2732; add.s64 %rd10732, %rd13021, 8; selp.b64 %rd10733, %rd10732, %rd13024, %p2732; add.s64 %rd10734, %rd13022, 8; selp.b64 %rd10735, %rd10734, %rd13025, %p2732; mov.u64 %rd13034, 0; setp.eq.s64 %p2733, %rd13033, 0; add.s64 %rd10736, %rd10731, 4; add.s64 %rd10737, %rd10733, 4; add.s64 %rd10738, %rd10735, 4; selp.b64 %rd5037, %rd10731, %rd10736, %p2733; selp.b64 %rd13024, %rd10733, %rd10737, %p2733; selp.b64 %rd13025, %rd10735, %rd10738, %p2733; selp.b64 %rd13020, %rd10730, %rd13020, %p2732; selp.b64 %rd13021, %rd10732, %rd13021, %p2732; selp.b64 %rd13022, %rd10734, %rd13022, %p2732; add.s64 %rd10739, %rd13023, 8; selp.b64 %rd13019, %rd10739, %rd13019, %p2732; add.s64 %rd10740, %rd13029, 8; setp.eq.s64 %p2734, %rd13026, %rd13032; selp.b64 %rd10741, %rd10740, %rd13026, 
%p2734; add.s64 %rd10742, %rd13030, 8; selp.b64 %rd10743, %rd10742, %rd13027, %p2734; add.s64 %rd10744, %rd13031, 8; selp.b64 %rd10745, %rd10744, %rd13028, %p2734; selp.b64 %rd13029, %rd10740, %rd13029, %p2734; selp.b64 %rd13030, %rd10742, %rd13030, %p2734; selp.b64 %rd13031, %rd10744, %rd13031, %p2734; add.s64 %rd10746, %rd13026, 8; selp.b64 %rd13032, %rd10746, %rd13032, %p2734; add.s64 %rd10747, %rd10741, 4; add.s64 %rd10748, %rd10743, 4; add.s64 %rd10749, %rd10745, 4; selp.b64 %rd13026, %rd10741, %rd10747, %p2733; selp.b64 %rd13027, %rd10743, %rd10748, %p2733; selp.b64 %rd13028, %rd10745, %rd10749, %p2733; ld.local.f32 %f5035, [%rd10743]; ld.local.f32 %f5036, [%rd10733]; setp.eq.f32 %p2735, %f5036, %f5035; mov.u64 %rd13023, %rd5037; @%p2735 bra $L__BB1_2018; $L__BB1_2020: mov.u64 %rd11288, 0; or.b64 %rd10751, %rd11288, %rd4808; mov.b64 {%r4570, %r4571}, %rd10751; mov.b64 {%r4572, %r4573}, %rd13034; cvt.u32.u64 %r4575, %rd11288; or.b32 %r5386, %r4575, %r4523; mov.u32 %r5387, 0; mov.b32 %f5487, %r4571; mov.b32 {%rs1041, %rs939}, %r4572; mov.u32 %r5388, %r5387; bra.uni $L__BB1_2021; $L__BB1_1980: sub.f32 %f1450, %f1433, %f1438; sub.f32 %f1451, %f1434, %f5483; mul.f32 %f4970, %f1441, %f1451; fma.rn.f32 %f1452, %f1439, %f1450, %f4970; mul.f32 %f4971, %f1451, %f1445; fma.rn.f32 %f1453, %f1450, %f1443, %f4971; setp.ge.f32 %p2679, %f1452, 0f00000000; setp.le.f32 %p2680, %f1453, %f1452; and.pred %p2681, %p2679, %p2680; @%p2681 bra $L__BB1_2013; bra.uni $L__BB1_1981; $L__BB1_2013: add.u64 %rd13006, %SP, 552; add.u64 %rd13012, %SP, 0; st.local.u64 [%rd4613], %rd4810; mov.u64 %rd13017, 2; mov.u64 %rd13003, %rd4626; mov.u64 %rd13004, %rd4625; mov.u64 %rd13005, %rd4625; mov.u64 %rd13007, %rd4625; mov.u64 %rd13008, %rd4625; mov.u64 %rd13009, %rd13006; mov.u64 %rd13010, %rd4613; mov.u64 %rd13011, %rd4613; mov.u64 %rd13013, %rd4613; mov.u64 %rd13014, %rd4613; mov.u64 %rd13015, %rd13012; mov.u64 %rd13016, %rd4616; $L__BB1_2014: setp.eq.s64 %p2726, %rd13017, 0; mov.u64 %rd13018, 
1; @%p2726 bra $L__BB1_2016; add.s64 %rd13017, %rd13017, -1; add.s64 %rd10703, %rd13004, 8; setp.eq.s64 %p2727, %rd13007, %rd13003; selp.b64 %rd10704, %rd10703, %rd13007, %p2727; add.s64 %rd10705, %rd13005, 8; selp.b64 %rd10706, %rd10705, %rd13008, %p2727; add.s64 %rd10707, %rd13006, 8; selp.b64 %rd10708, %rd10707, %rd13009, %p2727; mov.u64 %rd13018, 0; setp.eq.s64 %p2728, %rd13017, 0; add.s64 %rd10709, %rd10704, 4; add.s64 %rd10710, %rd10706, 4; add.s64 %rd10711, %rd10708, 4; selp.b64 %rd4999, %rd10704, %rd10709, %p2728; selp.b64 %rd13008, %rd10706, %rd10710, %p2728; selp.b64 %rd13009, %rd10708, %rd10711, %p2728; selp.b64 %rd13004, %rd10703, %rd13004, %p2727; selp.b64 %rd13005, %rd10705, %rd13005, %p2727; selp.b64 %rd13006, %rd10707, %rd13006, %p2727; add.s64 %rd10712, %rd13007, 8; selp.b64 %rd13003, %rd10712, %rd13003, %p2727; add.s64 %rd10713, %rd13013, 8; setp.eq.s64 %p2729, %rd13010, %rd13016; selp.b64 %rd10714, %rd10713, %rd13010, %p2729; add.s64 %rd10715, %rd13014, 8; selp.b64 %rd10716, %rd10715, %rd13011, %p2729; add.s64 %rd10717, %rd13015, 8; selp.b64 %rd10718, %rd10717, %rd13012, %p2729; selp.b64 %rd13013, %rd10713, %rd13013, %p2729; selp.b64 %rd13014, %rd10715, %rd13014, %p2729; selp.b64 %rd13015, %rd10717, %rd13015, %p2729; add.s64 %rd10719, %rd13010, 8; selp.b64 %rd13016, %rd10719, %rd13016, %p2729; add.s64 %rd10720, %rd10714, 4; add.s64 %rd10721, %rd10716, 4; add.s64 %rd10722, %rd10718, 4; selp.b64 %rd13010, %rd10714, %rd10720, %p2728; selp.b64 %rd13011, %rd10716, %rd10721, %p2728; selp.b64 %rd13012, %rd10718, %rd10722, %p2728; ld.local.f32 %f5033, [%rd10716]; ld.local.f32 %f5034, [%rd10706]; setp.eq.f32 %p2730, %f5034, %f5033; mov.u64 %rd13007, %rd4999; @%p2730 bra $L__BB1_2014; $L__BB1_2016: mov.u64 %rd11287, 0; or.b64 %rd10724, %rd11287, %rd4810; mov.b64 {%r4562, %r4563}, %rd10724; mov.b64 {%r4564, %r4565}, %rd13018; cvt.u32.u64 %r4567, %rd11287; or.b32 %r5386, %r4567, %r4525; mov.u32 %r5387, 0; mov.b32 %f5487, %r4563; mov.u32 %r5388, 1; mov.b32 
{%rs1041, %rs935}, %r4564; bra.uni $L__BB1_2021; $L__BB1_1981: sub.f32 %f1454, %f1433, %f1442; sub.f32 %f1455, %f1434, %f1444; mul.f32 %f4972, %f1441, %f1455; fma.rn.f32 %f1456, %f1439, %f1454, %f4972; mul.f32 %f4973, %f1445, %f1455; fma.rn.f32 %f1457, %f1443, %f1454, %f4973; setp.ge.f32 %p2682, %f1457, 0f00000000; setp.le.f32 %p2683, %f1456, %f1457; and.pred %p2684, %p2683, %p2682; @%p2684 bra $L__BB1_2009; bra.uni $L__BB1_1982; $L__BB1_2009: add.u64 %rd12990, %SP, 552; add.u64 %rd12996, %SP, 0; st.local.u64 [%rd4613], %rd4811; mov.u64 %rd13001, 2; mov.u64 %rd12987, %rd4626; mov.u64 %rd12988, %rd4625; mov.u64 %rd12989, %rd4625; mov.u64 %rd12991, %rd4625; mov.u64 %rd12992, %rd4625; mov.u64 %rd12993, %rd12990; mov.u64 %rd12994, %rd4613; mov.u64 %rd12995, %rd4613; mov.u64 %rd12997, %rd4613; mov.u64 %rd12998, %rd4613; mov.u64 %rd12999, %rd12996; mov.u64 %rd13000, %rd4618; $L__BB1_2010: setp.eq.s64 %p2721, %rd13001, 0; mov.u64 %rd13002, 1; @%p2721 bra $L__BB1_2012; add.s64 %rd13001, %rd13001, -1; add.s64 %rd10676, %rd12988, 8; setp.eq.s64 %p2722, %rd12991, %rd12987; selp.b64 %rd10677, %rd10676, %rd12991, %p2722; add.s64 %rd10678, %rd12989, 8; selp.b64 %rd10679, %rd10678, %rd12992, %p2722; add.s64 %rd10680, %rd12990, 8; selp.b64 %rd10681, %rd10680, %rd12993, %p2722; mov.u64 %rd13002, 0; setp.eq.s64 %p2723, %rd13001, 0; add.s64 %rd10682, %rd10677, 4; add.s64 %rd10683, %rd10679, 4; add.s64 %rd10684, %rd10681, 4; selp.b64 %rd4961, %rd10677, %rd10682, %p2723; selp.b64 %rd12992, %rd10679, %rd10683, %p2723; selp.b64 %rd12993, %rd10681, %rd10684, %p2723; selp.b64 %rd12988, %rd10676, %rd12988, %p2722; selp.b64 %rd12989, %rd10678, %rd12989, %p2722; selp.b64 %rd12990, %rd10680, %rd12990, %p2722; add.s64 %rd10685, %rd12991, 8; selp.b64 %rd12987, %rd10685, %rd12987, %p2722; add.s64 %rd10686, %rd12997, 8; setp.eq.s64 %p2724, %rd12994, %rd13000; selp.b64 %rd10687, %rd10686, %rd12994, %p2724; add.s64 %rd10688, %rd12998, 8; selp.b64 %rd10689, %rd10688, %rd12995, %p2724; add.s64 
%rd10690, %rd12999, 8; selp.b64 %rd10691, %rd10690, %rd12996, %p2724; selp.b64 %rd12997, %rd10686, %rd12997, %p2724; selp.b64 %rd12998, %rd10688, %rd12998, %p2724; selp.b64 %rd12999, %rd10690, %rd12999, %p2724; add.s64 %rd10692, %rd12994, 8; selp.b64 %rd13000, %rd10692, %rd13000, %p2724; add.s64 %rd10693, %rd10687, 4; add.s64 %rd10694, %rd10689, 4; add.s64 %rd10695, %rd10691, 4; selp.b64 %rd12994, %rd10687, %rd10693, %p2723; selp.b64 %rd12995, %rd10689, %rd10694, %p2723; selp.b64 %rd12996, %rd10691, %rd10695, %p2723; ld.local.f32 %f5031, [%rd10689]; ld.local.f32 %f5032, [%rd10679]; setp.eq.f32 %p2725, %f5032, %f5031; mov.u64 %rd12991, %rd4961; @%p2725 bra $L__BB1_2010; $L__BB1_2012: mov.u64 %rd11286, 0; or.b64 %rd10697, %rd11286, %rd4811; mov.b64 {%r4554, %r4555}, %rd10697; mov.b64 {%r4556, %r4557}, %rd13002; cvt.u32.u64 %r4559, %rd11286; or.b32 %r5386, %r4559, %r4527; mov.u32 %r5387, 0; mov.b32 %f5487, %r4555; mov.b32 {%rs1041, %rs931}, %r4556; mov.u32 %r5388, 2; bra.uni $L__BB1_2021; $L__BB1_1982: sub.f32 %f1458, %f1442, %f1438; sub.f32 %f1459, %f1444, %f5483; mul.f32 %f4974, %f1441, %f1443; mul.f32 %f4975, %f1439, %f1445; sub.f32 %f1460, %f4975, %f4974; mul.f32 %f4976, %f1446, %f1441; mul.f32 %f4977, %f1447, %f1439; sub.f32 %f4978, %f4977, %f4976; mul.f32 %f4979, %f4978, %f1460; setp.lt.f32 %p2685, %f4979, 0f00000000; setp.ge.f32 %p2686, %f1448, 0f00000000; and.pred %p2687, %p2686, %p2685; setp.le.f32 %p2688, %f1452, 0f00000000; and.pred %p2689, %p2688, %p2687; mov.u16 %rs1040, 0; @%p2689 bra $L__BB1_1985; mul.f32 %f4980, %f1443, %f1455; mul.f32 %f4981, %f1454, %f1445; sub.f32 %f4982, %f4980, %f4981; mul.f32 %f4983, %f1460, %f4982; setp.gt.f32 %p2690, %f4983, 0f80000000; setp.ge.f32 %p2691, %f1449, 0f00000000; and.pred %p2692, %p2691, %p2690; setp.le.f32 %p2693, %f1457, 0f00000000; and.pred %p2694, %p2693, %p2692; mov.u16 %rs1040, 1; @%p2694 bra $L__BB1_1985; mul.f32 %f4984, %f1458, %f1451; mul.f32 %f4985, %f1450, %f1459; sub.f32 %f4986, %f4984, %f4985; mul.f32 
%f4987, %f1460, %f4986; setp.lt.f32 %p2695, %f4987, 0f00000000; sub.f32 %f4988, %f1453, %f1452; setp.ge.f32 %p2696, %f4988, 0f00000000; and.pred %p2697, %p2696, %p2695; sub.f32 %f4989, %f1456, %f1457; setp.ge.f32 %p2698, %f4989, 0f00000000; and.pred %p2699, %p2698, %p2697; selp.b16 %rs1040, 2, 3, %p2699; $L__BB1_1985: mul.f32 %f4990, %f1441, %f1441; fma.rn.f32 %f4991, %f1439, %f1439, %f4990; add.f32 %f1461, %f4991, 0f00000000; mul.f32 %f4992, %f1445, %f1445; fma.rn.f32 %f4993, %f1443, %f1443, %f4992; add.f32 %f1462, %f4993, 0f00000000; mul.f32 %f4994, %f1459, %f1459; fma.rn.f32 %f4995, %f1458, %f1458, %f4994; add.f32 %f1463, %f4995, 0f00000000; setp.eq.s16 %p2700, %rs1040, 1; @%p2700 bra $L__BB1_2000; setp.eq.s16 %p2701, %rs1040, 2; @%p2701 bra $L__BB1_1996; setp.ne.s16 %p2702, %rs1040, 3; @%p2702 bra $L__BB1_2004; sub.f32 %f4996, %f1448, %f1452; div.rn.f32 %f1464, %f1448, %f4996; sub.f32 %f4997, %f1449, %f1457; div.rn.f32 %f1465, %f1449, %f4997; sub.f32 %f4998, %f1453, %f1452; add.f32 %f4999, %f1456, %f4998; sub.f32 %f5000, %f4999, %f1457; div.rn.f32 %f5485, %f4998, %f5000; mul.f32 %f5001, %f1447, %f1447; fma.rn.f32 %f5002, %f1446, %f1446, %f5001; add.f32 %f5003, %f5002, 0f00000000; mul.f32 %f5004, %f1461, %f1464; mul.f32 %f5005, %f1464, %f5004; sub.f32 %f1467, %f5003, %f5005; mul.f32 %f5006, %f1462, %f5485; mul.f32 %f5007, %f5485, %f5006; sub.f32 %f1468, %f5003, %f5007; mul.f32 %f5008, %f1451, %f1451; fma.rn.f32 %f5009, %f1450, %f1450, %f5008; add.f32 %f5010, %f5009, 0f00000000; mul.f32 %f5011, %f1463, %f1465; mul.f32 %f5012, %f1465, %f5011; sub.f32 %f1469, %f5010, %f5012; setp.lt.f32 %p2703, %f1467, %f1468; @%p2703 bra $L__BB1_1992; bra.uni $L__BB1_1989; $L__BB1_1992: setp.lt.f32 %p2705, %f1467, %f1469; @%p2705 bra $L__BB1_1994; bra.uni $L__BB1_1993; $L__BB1_1994: mul.f32 %f5484, %f1441, %f1464; fma.rn.f32 %f5482, %f1439, %f1464, %f1436; mov.u32 %r5388, 0; mov.f32 %f5483, %f1437; mov.f32 %f5485, %f1464; bra.uni $L__BB1_1995; $L__BB1_1996: add.u64 %rd12940, %SP, 
552; add.u64 %rd12946, %SP, 0; mul.f32 %f5015, %f1451, %f1459; fma.rn.f32 %f5016, %f1450, %f1458, %f5015; div.rn.f32 %f5486, %f5016, %f1463; fma.rn.f32 %f5017, %f1458, %f5486, %f1438; mov.b32 %r4536, %f5017; fma.rn.f32 %f5018, %f1459, %f5486, %f5483; mov.b32 %r4537, %f5018; cvt.u64.u32 %rd10589, %r4537; cvt.u64.u32 %rd10590, %r4536; bfi.b64 %rd4819, %rd10589, %rd10590, 32, 32; st.local.u64 [%rd4613], %rd4819; mov.u64 %rd12951, 2; mov.u64 %rd12937, %rd4626; mov.u64 %rd12938, %rd4625; mov.u64 %rd12939, %rd4625; mov.u64 %rd12941, %rd4625; mov.u64 %rd12942, %rd4625; mov.u64 %rd12943, %rd12940; mov.u64 %rd12944, %rd4613; mov.u64 %rd12945, %rd4613; mov.u64 %rd12947, %rd4613; mov.u64 %rd12948, %rd4613; mov.u64 %rd12949, %rd12946; mov.u64 %rd12950, %rd4624; $L__BB1_1997: setp.eq.s64 %p2706, %rd12951, 0; mov.u64 %rd12986, 1; @%p2706 bra $L__BB1_1999; add.s64 %rd12951, %rd12951, -1; add.s64 %rd10595, %rd12938, 8; setp.eq.s64 %p2707, %rd12941, %rd12937; selp.b64 %rd10596, %rd10595, %rd12941, %p2707; add.s64 %rd10597, %rd12939, 8; selp.b64 %rd10598, %rd10597, %rd12942, %p2707; add.s64 %rd10599, %rd12940, 8; selp.b64 %rd10600, %rd10599, %rd12943, %p2707; mov.u64 %rd12986, 0; setp.eq.s64 %p2708, %rd12951, 0; add.s64 %rd10601, %rd10596, 4; add.s64 %rd10602, %rd10598, 4; add.s64 %rd10603, %rd10600, 4; selp.b64 %rd4836, %rd10596, %rd10601, %p2708; selp.b64 %rd12942, %rd10598, %rd10602, %p2708; selp.b64 %rd12943, %rd10600, %rd10603, %p2708; selp.b64 %rd12938, %rd10595, %rd12938, %p2707; selp.b64 %rd12939, %rd10597, %rd12939, %p2707; selp.b64 %rd12940, %rd10599, %rd12940, %p2707; add.s64 %rd10604, %rd12941, 8; selp.b64 %rd12937, %rd10604, %rd12937, %p2707; add.s64 %rd10605, %rd12947, 8; setp.eq.s64 %p2709, %rd12944, %rd12950; selp.b64 %rd10606, %rd10605, %rd12944, %p2709; add.s64 %rd10607, %rd12948, 8; selp.b64 %rd10608, %rd10607, %rd12945, %p2709; add.s64 %rd10609, %rd12949, 8; selp.b64 %rd10610, %rd10609, %rd12946, %p2709; selp.b64 %rd12947, %rd10605, %rd12947, %p2709; selp.b64 
%rd12948, %rd10607, %rd12948, %p2709; selp.b64 %rd12949, %rd10609, %rd12949, %p2709; add.s64 %rd10611, %rd12944, 8; selp.b64 %rd12950, %rd10611, %rd12950, %p2709; add.s64 %rd10612, %rd10606, 4; add.s64 %rd10613, %rd10608, 4; add.s64 %rd10614, %rd10610, 4; selp.b64 %rd12944, %rd10606, %rd10612, %p2708; selp.b64 %rd12945, %rd10608, %rd10613, %p2708; selp.b64 %rd12946, %rd10610, %rd10614, %p2708; ld.local.f32 %f5019, [%rd10608]; ld.local.f32 %f5020, [%rd10598]; setp.eq.f32 %p2710, %f5020, %f5019; mov.u64 %rd12941, %rd4836; @%p2710 bra $L__BB1_1997; $L__BB1_1999: mov.u64 %rd11283, 0; or.b64 %rd12985, %rd11283, %rd4819; mov.u32 %r5388, 1; bra.uni $L__BB1_2008; $L__BB1_2000: add.u64 %rd12956, %SP, 552; add.u64 %rd12962, %SP, 0; div.rn.f32 %f5486, %f1449, %f1462; fma.rn.f32 %f5021, %f1443, %f5486, %f1436; mov.b32 %r4539, %f5021; fma.rn.f32 %f5022, %f1445, %f5486, %f1437; mov.b32 %r4540, %f5022; cvt.u64.u32 %rd10616, %r4540; cvt.u64.u32 %rd10617, %r4539; bfi.b64 %rd4860, %rd10616, %rd10617, 32, 32; st.local.u64 [%rd4613], %rd4860; mov.u64 %rd12967, 2; mov.u64 %rd12953, %rd4626; mov.u64 %rd12954, %rd4625; mov.u64 %rd12955, %rd4625; mov.u64 %rd12957, %rd4625; mov.u64 %rd12958, %rd4625; mov.u64 %rd12959, %rd12956; mov.u64 %rd12960, %rd4613; mov.u64 %rd12961, %rd4613; mov.u64 %rd12963, %rd4613; mov.u64 %rd12964, %rd4613; mov.u64 %rd12965, %rd12962; mov.u64 %rd12966, %rd4622; $L__BB1_2001: setp.eq.s64 %p2711, %rd12967, 0; mov.u64 %rd12986, 1; @%p2711 bra $L__BB1_2003; add.s64 %rd12967, %rd12967, -1; add.s64 %rd10622, %rd12954, 8; setp.eq.s64 %p2712, %rd12957, %rd12953; selp.b64 %rd10623, %rd10622, %rd12957, %p2712; add.s64 %rd10624, %rd12955, 8; selp.b64 %rd10625, %rd10624, %rd12958, %p2712; add.s64 %rd10626, %rd12956, 8; selp.b64 %rd10627, %rd10626, %rd12959, %p2712; mov.u64 %rd12986, 0; setp.eq.s64 %p2713, %rd12967, 0; add.s64 %rd10628, %rd10623, 4; add.s64 %rd10629, %rd10625, 4; add.s64 %rd10630, %rd10627, 4; selp.b64 %rd4877, %rd10623, %rd10628, %p2713; selp.b64 %rd12958, 
%rd10625, %rd10629, %p2713; selp.b64 %rd12959, %rd10627, %rd10630, %p2713; selp.b64 %rd12954, %rd10622, %rd12954, %p2712; selp.b64 %rd12955, %rd10624, %rd12955, %p2712; selp.b64 %rd12956, %rd10626, %rd12956, %p2712; add.s64 %rd10631, %rd12957, 8; selp.b64 %rd12953, %rd10631, %rd12953, %p2712; add.s64 %rd10632, %rd12963, 8; setp.eq.s64 %p2714, %rd12960, %rd12966; selp.b64 %rd10633, %rd10632, %rd12960, %p2714; add.s64 %rd10634, %rd12964, 8; selp.b64 %rd10635, %rd10634, %rd12961, %p2714; add.s64 %rd10636, %rd12965, 8; selp.b64 %rd10637, %rd10636, %rd12962, %p2714; selp.b64 %rd12963, %rd10632, %rd12963, %p2714; selp.b64 %rd12964, %rd10634, %rd12964, %p2714; selp.b64 %rd12965, %rd10636, %rd12965, %p2714; add.s64 %rd10638, %rd12960, 8; selp.b64 %rd12966, %rd10638, %rd12966, %p2714; add.s64 %rd10639, %rd10633, 4; add.s64 %rd10640, %rd10635, 4; add.s64 %rd10641, %rd10637, 4; selp.b64 %rd12960, %rd10633, %rd10639, %p2713; selp.b64 %rd12961, %rd10635, %rd10640, %p2713; selp.b64 %rd12962, %rd10637, %rd10641, %p2713; ld.local.f32 %f5023, [%rd10635]; ld.local.f32 %f5024, [%rd10625]; setp.eq.f32 %p2715, %f5024, %f5023; mov.u64 %rd12957, %rd4877; @%p2715 bra $L__BB1_2001; $L__BB1_2003: mov.u64 %rd11284, 0; or.b64 %rd12985, %rd11284, %rd4860; mov.u32 %r5388, 2; bra.uni $L__BB1_2008; $L__BB1_2004: div.rn.f32 %f5486, %f1448, %f1461; fma.rn.f32 %f5025, %f1439, %f5486, %f1436; mov.b32 %r4542, %f5025; fma.rn.f32 %f5026, %f1441, %f5486, %f1437; mov.b32 %r4543, %f5026; cvt.u64.u32 %rd10643, %r4543; cvt.u64.u32 %rd10644, %r4542; bfi.b64 %rd4901, %rd10643, %rd10644, 32, 32; st.local.u64 [%rd4613], %rd4901; mov.u64 %rd12983, 2; mov.u64 %rd12969, %rd4626; mov.u64 %rd12970, %rd4625; mov.u64 %rd12971, %rd4625; mov.u64 %rd12972, %rd10373; mov.u64 %rd12973, %rd4625; mov.u64 %rd12974, %rd4625; mov.u64 %rd12975, %rd10373; mov.u64 %rd12976, %rd4613; mov.u64 %rd12977, %rd4613; mov.u64 %rd12978, %rd10367; mov.u64 %rd12979, %rd4613; mov.u64 %rd12980, %rd4613; mov.u64 %rd12981, %rd10367; mov.u64 
%rd12982, %rd4620; $L__BB1_2005: setp.eq.s64 %p2716, %rd12983, 0; mov.u64 %rd12986, 1; @%p2716 bra $L__BB1_2007; add.s64 %rd12983, %rd12983, -1; add.s64 %rd10649, %rd12970, 8; setp.eq.s64 %p2717, %rd12973, %rd12969; selp.b64 %rd10650, %rd10649, %rd12973, %p2717; add.s64 %rd10651, %rd12971, 8; selp.b64 %rd10652, %rd10651, %rd12974, %p2717; add.s64 %rd10653, %rd12972, 8; selp.b64 %rd10654, %rd10653, %rd12975, %p2717; mov.u64 %rd12986, 0; setp.eq.s64 %p2718, %rd12983, 0; add.s64 %rd10655, %rd10650, 4; add.s64 %rd10656, %rd10652, 4; add.s64 %rd10657, %rd10654, 4; selp.b64 %rd4918, %rd10650, %rd10655, %p2718; selp.b64 %rd12974, %rd10652, %rd10656, %p2718; selp.b64 %rd12975, %rd10654, %rd10657, %p2718; selp.b64 %rd12970, %rd10649, %rd12970, %p2717; selp.b64 %rd12971, %rd10651, %rd12971, %p2717; selp.b64 %rd12972, %rd10653, %rd12972, %p2717; add.s64 %rd10658, %rd12973, 8; selp.b64 %rd12969, %rd10658, %rd12969, %p2717; add.s64 %rd10659, %rd12979, 8; setp.eq.s64 %p2719, %rd12976, %rd12982; selp.b64 %rd10660, %rd10659, %rd12976, %p2719; add.s64 %rd10661, %rd12980, 8; selp.b64 %rd10662, %rd10661, %rd12977, %p2719; add.s64 %rd10663, %rd12981, 8; selp.b64 %rd10664, %rd10663, %rd12978, %p2719; selp.b64 %rd12979, %rd10659, %rd12979, %p2719; selp.b64 %rd12980, %rd10661, %rd12980, %p2719; selp.b64 %rd12981, %rd10663, %rd12981, %p2719; add.s64 %rd10665, %rd12976, 8; selp.b64 %rd12982, %rd10665, %rd12982, %p2719; add.s64 %rd10666, %rd10660, 4; add.s64 %rd10667, %rd10662, 4; add.s64 %rd10668, %rd10664, 4; selp.b64 %rd12976, %rd10660, %rd10666, %p2718; selp.b64 %rd12977, %rd10662, %rd10667, %p2718; selp.b64 %rd12978, %rd10664, %rd10668, %p2718; ld.local.f32 %f5027, [%rd10662]; ld.local.f32 %f5028, [%rd10652]; setp.eq.f32 %p2720, %f5028, %f5027; mov.u64 %rd12973, %rd4918; @%p2720 bra $L__BB1_2005; $L__BB1_2007: mov.u64 %rd11285, 0; or.b64 %rd12985, %rd11285, %rd4901; mov.u32 %r5388, 0; $L__BB1_2008: mov.f32 %f5029, 0f3F800000; sub.f32 %f5030, %f5029, %f5486; mov.b32 %r4546, %f5030; 
mov.b32 %r4547, %f5486; cvt.u64.u32 %rd10669, %r4547; cvt.u64.u32 %rd10670, %r4546; bfi.b64 %rd13035, %rd10669, %rd10670, 32, 32; mov.b64 {%r4548, %r4549}, %rd12986; mov.b64 {%r4550, %r4551}, %rd12985; cvt.u32.u64 %r5386, %rd12985; mov.b32 %f5487, %r4551; mov.u32 %r5387, 1; mov.b32 {%rs1041, %rs927}, %r4548; bra.uni $L__BB1_2021; $L__BB1_1989: setp.lt.f32 %p2704, %f1468, %f1469; @%p2704 bra $L__BB1_1991; bra.uni $L__BB1_1990; $L__BB1_1991: mul.f32 %f5484, %f1445, %f1465; fma.rn.f32 %f5482, %f1443, %f1465, %f1436; mov.u32 %r5388, 2; mov.f32 %f5483, %f1437; mov.f32 %f5485, %f1465; bra.uni $L__BB1_1995; $L__BB1_1993: mul.f32 %f5484, %f1459, %f5485; fma.rn.f32 %f5482, %f1458, %f5485, %f1438; mov.u32 %r5388, 1; bra.uni $L__BB1_1995; $L__BB1_1990: mul.f32 %f5484, %f1459, %f5485; fma.rn.f32 %f5482, %f1458, %f5485, %f1438; mov.u32 %r5388, 1; $L__BB1_1995: add.f32 %f5487, %f5483, %f5484; mov.f32 %f5013, 0f3F800000; sub.f32 %f5014, %f5013, %f5485; mov.b32 %r4534, %f5014; mov.b32 %r4535, %f5485; cvt.u64.u32 %rd10586, %r4535; cvt.u64.u32 %rd10587, %r4534; bfi.b64 %rd13035, %rd10586, %rd10587, 32, 32; mov.b32 %r5386, %f5482; mov.u32 %r5387, 1; mov.u16 %rs1041, 1; $L__BB1_2021: mov.b32 %f5037, %r5386; sub.f32 %f5038, %f5037, %f1433; mul.f32 %f5039, %f5038, %f5038; sub.f32 %f5040, %f5487, %f1434; fma.rn.f32 %f5041, %f5040, %f5040, %f5039; add.f32 %f5042, %f5041, 0f00000000; sqrt.rn.f32 %f5043, %f5042; shl.b64 %rd10752, %rd4800, 2; add.s64 %rd10753, %rd24, %rd10752; st.local.f32 [%rd10753+-4], %f5043; mul.lo.s64 %rd10754, %rd4800, 36; add.s64 %rd10755, %rd4605, %rd10754; st.local.u32 [%rd10755+-36], %r5386; st.local.f32 [%rd10755+-32], %f5487; mov.u16 %rs940, 0; st.local.v4.u8 [%rd10755+-28], {%rs1041, %rs940, %rs940, %rs940}; st.local.u32 [%rd10755+-24], %r1491; st.local.u32 [%rd10755+-20], %r5387; st.local.u32 [%rd10755+-16], %r5388; shr.u64 %rd10756, %rd13035, 32; st.local.u32 [%rd10755+-8], %rd10756; st.local.u32 [%rd10755+-12], %rd13035; $L__BB1_2022: setp.lt.u64 %p2736, 
%rd4800, 4; add.s64 %rd4800, %rd4800, 1; @%p2736 bra $L__BB1_1969; ld.local.v2.u64 {%rd13036, %rd13037}, [%rd24]; ld.local.v4.u32 {%r5398, %r5399, %r5400, %r4579}, [%rd4605]; ld.local.u32 %r5401, [%rd4605+16]; ld.local.u32 %rd10759, [%rd4605+36]; ld.local.u32 %rd10760, [%rd4605+40]; bfi.b64 %rd10761, %rd10760, %rd10759, 32, 32; mov.b64 {%r5395, %r5396}, %rd10761; ld.local.u32 %r5397, [%rd4605+44]; ld.local.u32 %r5402, [%rd4605+52]; ld.local.u32 %r5394, [%rd4605+80]; ld.local.u64 %rd10762, [%rd4605+72]; mov.b64 {%r5392, %r5393}, %rd10762; ld.local.u32 %r5403, [%rd4605+88]; ld.local.u32 %rd10763, [%rd4605+108]; ld.local.u32 %rd10764, [%rd4605+112]; bfi.b64 %rd10765, %rd10764, %rd10763, 32, 32; mov.b64 {%r5389, %r5390}, %rd10765; ld.local.u32 %r5391, [%rd4605+116]; ld.local.u32 %r5404, [%rd4605+124]; bra.uni $L__BB1_2024; $L__BB1_1967: mov.u32 %r5401, 4; mov.u32 %r5402, %r5401; mov.u32 %r5403, %r5401; mov.u32 %r5404, %r5401; $L__BB1_2024: and.b64 %rd10766, %rd4795, 1; setp.eq.b64 %p2737, %rd10766, 1; mov.pred %p2738, 0; xor.pred %p2739, %p2737, %p2738; not.pred %p2740, %p2739; mov.b64 {%r1534, %r1535}, %rd13036; mov.b32 %f1492, %r1534; mov.b32 %f1493, %r1535; mov.b64 {%r1536, %r1537}, %rd13037; mov.b32 %f1494, %r1536; mov.b32 %f1495, %r1537; @%p2740 bra $L__BB1_2033; bra.uni $L__BB1_2025; $L__BB1_2033: and.b64 %rd10782, %rd4795, 2; setp.eq.s64 %p2754, %rd10782, 0; @%p2754 bra $L__BB1_2042; bra.uni $L__BB1_2034; $L__BB1_2042: and.b64 %rd10798, %rd4795, 4; setp.eq.s64 %p2768, %rd10798, 0; @%p2768 bra $L__BB1_2051; bra.uni $L__BB1_2043; $L__BB1_2051: and.b64 %rd10814, %rd4795, 8; setp.eq.s64 %p2782, %rd10814, 0; @%p2782 bra $L__BB1_1959; ld.u8 %rs947, [%rd4784+88]; and.b16 %rs948, %rs947, 1; setp.eq.b16 %p2783, %rs948, 1; mov.pred %p2784, 0; xor.pred %p2785, %p2783, %p2784; not.pred %p2786, %p2785; @%p2786 bra $L__BB1_2055; bra.uni $L__BB1_2053; $L__BB1_2055: ld.u32 %r1585, [%rd4784+76]; cvt.u64.u32 %rd10818, %r1585; setp.le.u64 %p2793, %rd4772, %rd10818; @%p2793 bra 
$L__BB1_1959; neg.f32 %f1499, %f1495; setp.lt.u32 %p2794, %r1490, 64; @%p2794 bra $L__BB1_2058; bra.uni $L__BB1_2057; $L__BB1_2058: mul.wide.u32 %rd10828, %r1490, 8; add.s64 %rd10829, %rd4609, %rd10828; mov.u64 %rd13044, 0; st.local.u32 [%rd10829], %r1585; st.local.f32 [%rd10829+4], %f1499; add.s32 %r1490, %r1490, 1; st.local.u32 [%rd4609+512], %r1490; mov.u64 %rd13045, %rd13044; bra.uni $L__BB1_2059; $L__BB1_2025: ld.u8 %rs941, [%rd4784+88]; and.b16 %rs942, %rs941, 1; setp.eq.b16 %p2741, %rs942, 1; xor.pred %p2743, %p2741, %p2738; not.pred %p2744, %p2743; @%p2744 bra $L__BB1_2028; bra.uni $L__BB1_2026; $L__BB1_2028: ld.u32 %r1543, [%rd4784+64]; cvt.u64.u32 %rd10770, %r1543; setp.le.u64 %p2751, %rd4772, %rd10770; @%p2751 bra $L__BB1_2033; neg.f32 %f1496, %f1492; setp.lt.u32 %p2752, %r1490, 64; @%p2752 bra $L__BB1_2031; bra.uni $L__BB1_2030; $L__BB1_2031: add.s32 %r4582, %r1489, -1; mul.wide.u32 %rd10780, %r4582, 8; add.s64 %rd10781, %rd4609, %rd10780; mov.u64 %rd13038, 0; st.local.u32 [%rd10781], %r1543; st.local.f32 [%rd10781+4], %f1496; add.s32 %r1490, %r1490, 1; st.local.u32 [%rd4609+512], %r1490; mov.u64 %rd13039, %rd13038; bra.uni $L__BB1_2032; $L__BB1_2034: ld.u8 %rs943, [%rd4784+88]; and.b16 %rs944, %rs943, 1; setp.eq.b16 %p2755, %rs944, 1; mov.pred %p2756, 0; xor.pred %p2757, %p2755, %p2756; not.pred %p2758, %p2757; @%p2758 bra $L__BB1_2037; bra.uni $L__BB1_2035; $L__BB1_2037: ld.u32 %r1557, [%rd4784+68]; cvt.u64.u32 %rd10786, %r1557; setp.le.u64 %p2765, %rd4772, %rd10786; @%p2765 bra $L__BB1_2042; neg.f32 %f1497, %f1493; setp.lt.u32 %p2766, %r1490, 64; @%p2766 bra $L__BB1_2040; bra.uni $L__BB1_2039; $L__BB1_2040: mul.wide.u32 %rd10796, %r1490, 8; add.s64 %rd10797, %rd4609, %rd10796; mov.u64 %rd13040, 0; st.local.u32 [%rd10797], %r1557; st.local.f32 [%rd10797+4], %f1497; add.s32 %r1490, %r1490, 1; st.local.u32 [%rd4609+512], %r1490; mov.u64 %rd13041, %rd13040; bra.uni $L__BB1_2041; $L__BB1_2043: ld.u8 %rs945, [%rd4784+88]; and.b16 %rs946, %rs945, 1; 
setp.eq.b16 %p2769, %rs946, 1; mov.pred %p2770, 0; xor.pred %p2771, %p2769, %p2770; not.pred %p2772, %p2771; @%p2772 bra $L__BB1_2046; bra.uni $L__BB1_2044; $L__BB1_2046: ld.u32 %r1571, [%rd4784+72]; cvt.u64.u32 %rd10802, %r1571; setp.le.u64 %p2779, %rd4772, %rd10802; @%p2779 bra $L__BB1_2051; neg.f32 %f1498, %f1494; setp.lt.u32 %p2780, %r1490, 64; @%p2780 bra $L__BB1_2049; bra.uni $L__BB1_2048; $L__BB1_2049: mul.wide.u32 %rd10812, %r1490, 8; add.s64 %rd10813, %rd4609, %rd10812; mov.u64 %rd13042, 0; st.local.u32 [%rd10813], %r1571; st.local.f32 [%rd10813+4], %f1498; add.s32 %r1490, %r1490, 1; st.local.u32 [%rd4609+512], %r1490; mov.u64 %rd13043, %rd13042; bra.uni $L__BB1_2050; $L__BB1_2026: setp.leu.f32 %p2745, %f1435, %f1492; setp.eq.s32 %p2746, %r5401, 4; or.pred %p2747, %p2746, %p2745; @%p2747 bra $L__BB1_2033; ld.u32 %r4580, [%rd4784+64]; cvt.u64.u32 %rd10767, %r4580; setp.le.u64 %p2748, %rd4775, %rd10767; mul.wide.u32 %rd10768, %r4580, 12; add.s64 %rd10769, %rd4776, %rd10768; setp.eq.s64 %p2749, %rd10769, 0; or.pred %p2750, %p2748, %p2749; selp.b32 %r1485, %r1485, %r5400, %p2750; selp.b32 %r1484, %r1484, %r5399, %p2750; selp.b32 %r1483, %r1483, %r5398, %p2750; selp.b32 %r1487, %r1487, %r5401, %p2750; selp.b32 %r1488, %r1488, %r1534, %p2750; bra.uni $L__BB1_2033; $L__BB1_2053: mov.b32 %f5046, %r1488; setp.leu.f32 %p2787, %f5046, %f1495; setp.eq.s32 %p2788, %r5404, 4; or.pred %p2789, %p2788, %p2787; @%p2789 bra $L__BB1_1959; bra.uni $L__BB1_2054; $L__BB1_2035: mov.b32 %f5044, %r1488; setp.leu.f32 %p2759, %f5044, %f1493; setp.eq.s32 %p2760, %r5402, 4; or.pred %p2761, %p2760, %p2759; @%p2761 bra $L__BB1_2042; ld.u32 %r4588, [%rd4784+68]; cvt.u64.u32 %rd10783, %r4588; setp.le.u64 %p2762, %rd4775, %rd10783; mul.wide.u32 %rd10784, %r4588, 12; add.s64 %rd10785, %rd4776, %rd10784; setp.eq.s64 %p2763, %rd10785, 0; or.pred %p2764, %p2762, %p2763; selp.b32 %r1485, %r1485, %r5397, %p2764; selp.b32 %r1484, %r1484, %r5396, %p2764; selp.b32 %r1483, %r1483, %r5395, %p2764; 
selp.b32 %r1487, %r1487, %r5402, %p2764; selp.b32 %r1488, %r1488, %r1535, %p2764; bra.uni $L__BB1_2042; $L__BB1_2044: mov.b32 %f5045, %r1488; setp.leu.f32 %p2773, %f5045, %f1494; setp.eq.s32 %p2774, %r5403, 4; or.pred %p2775, %p2774, %p2773; @%p2775 bra $L__BB1_2051; ld.u32 %r4595, [%rd4784+72]; cvt.u64.u32 %rd10799, %r4595; setp.le.u64 %p2776, %rd4775, %rd10799; mul.wide.u32 %rd10800, %r4595, 12; add.s64 %rd10801, %rd4776, %rd10800; setp.eq.s64 %p2777, %rd10801, 0; or.pred %p2778, %p2776, %p2777; selp.b32 %r1485, %r1485, %r5394, %p2778; selp.b32 %r1484, %r1484, %r5393, %p2778; selp.b32 %r1483, %r1483, %r5392, %p2778; selp.b32 %r1487, %r1487, %r5403, %p2778; selp.b32 %r1488, %r1488, %r1536, %p2778; bra.uni $L__BB1_2051; $L__BB1_2057: mov.u64 %rd13045, 1; shl.b64 %rd13044, %rd10818, 32; $L__BB1_2059: mov.u64 %rd11298, 0; cvt.u32.u64 %r4604, %rd11298; cvt.u32.u64 %r4605, %rd13044; or.b32 %r4606, %r4605, %r4604; cvt.u32.u64 %r4607, %rd13045; or.b32 %r4608, %r4606, %r4607; setp.eq.s32 %p2795, %r4608, 0; @%p2795 bra $L__BB1_1959; bra.uni $L__BB1_2060; $L__BB1_2030: mov.u64 %rd13039, 1; shl.b64 %rd13038, %rd10770, 32; $L__BB1_2032: mov.u64 %rd11289, 0; cvt.u32.u64 %r4583, %rd11289; cvt.u32.u64 %r4584, %rd13038; or.b32 %r4585, %r4584, %r4583; cvt.u32.u64 %r4586, %rd13039; or.b32 %r4587, %r4585, %r4586; setp.ne.s32 %p2753, %r4587, 0; @%p2753 bra $L__BB1_2060; bra.uni $L__BB1_2033; $L__BB1_2039: mov.u64 %rd13041, 1; shl.b64 %rd13040, %rd10786, 32; $L__BB1_2041: mov.u64 %rd11292, 0; cvt.u32.u64 %r4590, %rd11292; cvt.u32.u64 %r4591, %rd13040; or.b32 %r4592, %r4591, %r4590; cvt.u32.u64 %r4593, %rd13041; or.b32 %r4594, %r4592, %r4593; setp.ne.s32 %p2767, %r4594, 0; @%p2767 bra $L__BB1_2060; bra.uni $L__BB1_2042; $L__BB1_2048: mov.u64 %rd13043, 1; shl.b64 %rd13042, %rd10802, 32; $L__BB1_2050: mov.u64 %rd11295, 0; cvt.u32.u64 %r4597, %rd11295; cvt.u32.u64 %r4598, %rd13042; or.b32 %r4599, %r4598, %r4597; cvt.u32.u64 %r4600, %rd13043; or.b32 %r4601, %r4599, %r4600; setp.ne.s32 
%p2781, %r4601, 0; @%p2781 bra $L__BB1_2060; bra.uni $L__BB1_2051; $L__BB1_2061: setp.eq.s32 %p2796, %r1487, 4; mov.u64 %rd13046, %rd10502; mov.u64 %rd13047, %rd10500; mov.u64 %rd13048, %rd10502; @%p2796 bra $L__BB1_2063; mov.b64 %rd13048, {%r1483, %r1484}; mov.b32 {%rs949, %rs950}, %r1485; mov.b64 %rd10836, {%r1485, %r4609}; and.b64 %rd13046, %rd10836, 4294967040; cvt.u64.u16 %rd10837, %rs949; and.b64 %rd13047, %rd10837, 255; $L__BB1_2063: or.b64 %rd10844, %rd13047, %rd13046; or.b64 %rd10845, %rd10844, %rd10502; mov.b64 {%r4610, %r4611}, %rd10845; mov.b32 {%rs202, %rs951}, %r4610; and.b16 %rs952, %rs202, 255; setp.eq.s16 %p2797, %rs952, 2; @%p2797 bra $L__BB1_2065; cvt.u32.u64 %r4612, %rd13048; mov.b32 %f5047, %r4612; shr.u64 %rd10846, %rd13048, 32; cvt.u32.u64 %r4613, %rd10846; mov.b32 %f5048, %r4613; mul.f32 %f5049, %f1432, %f5047; mul.f32 %f5050, %f1431, %f5048; sub.f32 %f5051, %f5049, %f5050; mul.f32 %f5052, %f1432, %f5048; fma.rn.f32 %f5053, %f1431, %f5047, %f5052; add.f32 %f5054, %f1429, %f5051; mov.b32 %r4614, %f5054; add.f32 %f5055, %f1430, %f5053; mov.b32 %r4615, %f5055; cvt.u64.u32 %rd10847, %r4615; cvt.u64.u32 %rd10848, %r4614; cvt.u64.u16 %rd10849, %rs202; bfi.b64 %rd10502, %rd10847, %rd10848, 32, 32; and.b64 %rd10850, %rd10849, 255; mov.b64 {%r4616, %r4617}, %rd10850; mov.b32 {%rs953, %rs954}, %r4616; cvt.u64.u16 %rd10500, %rs953; $L__BB1_2065: mov.u64 %rd11307, 0; or.b64 %rd10857, %rd11307, %rd10500; or.b64 %rd5124, %rd10857, %rd11307; mov.b64 {%r4618, %r4619}, %rd5124; mov.b32 {%rs203, %rs955}, %r4618; and.b16 %rs956, %rs203, 255; setp.eq.s16 %p2798, %rs956, 2; mov.u64 %rd13051, 2; mov.u64 %rd13052, %rd11307; mov.u64 %rd13053, %rd11307; @%p2798 bra $L__BB1_2067; and.b64 %rd10859, %rd5124, 4294967040; cvt.u64.u16 %rd10860, %rs203; and.b64 %rd10861, %rd10860, 255; or.b64 %rd10862, %rd10861, %rd11307; or.b64 %rd10863, %rd10862, %rd10859; mov.b64 {%r4620, %r4621}, %rd10863; mov.b32 {%rs957, %rs958}, %r4620; not.b16 %rs959, %rs957; ld.global.u8 %rs960, 
[%rd4641+240]; setp.eq.s16 %p2799, %rs960, 0; and.b16 %rs961, %rs959, 1; selp.b16 %rs962, %rs957, %rs961, %p2799; and.b64 %rd10864, %rd10863, 4294967040; cvt.u64.u16 %rd10865, %rs962; and.b64 %rd10866, %rd10865, 255; or.b64 %rd10867, %rd10864, %rd11307; or.b64 %rd10868, %rd10867, %rd10866; mov.b64 {%r4622, %r4623}, %rd10868; mov.b32 {%rs963, %rs964}, %r4622; and.b64 %rd13053, %rd10868, 4294967040; cvt.u64.u16 %rd10869, %rs963; and.b64 %rd13051, %rd10869, 255; mov.u64 %rd13052, %rd10502; $L__BB1_2067: or.b64 %rd10870, %rd13052, %rd11307; or.b64 %rd10871, %rd11307, %rd13051; or.b64 %rd10872, %rd10871, %rd13053; or.b64 %rd10873, %rd10870, %rd11307; mov.b64 {%r5435, %r5436}, %rd10873; mov.b64 {%r5437, %r4624}, %rd10872; bra.uni $L__BB1_2124; $L__BB1_1927: cvt.u32.u64 %r4417, %rd4646; cvt.u32.u64 %r4418, %rd4671; rem.u32 %r4419, %r4418, %r4417; cvt.u64.u32 %rd12893, %r4419; $L__BB1_1928: shl.b64 %rd10407, %rd12893, 3; add.s64 %rd4675, %rd4647, %rd10407; ld.u32 %rd10408, [%rd4675]; ld.u32 %rd10409, [%rd4675+4]; bfi.b64 %rd4676, %rd10409, %rd10408, 32, 32; add.s64 %rd4677, %rd12893, 1; or.b64 %rd10410, %rd4677, %rd4646; and.b64 %rd10411, %rd10410, -4294967296; setp.eq.s64 %p2617, %rd10411, 0; @%p2617 bra $L__BB1_1930; rem.u64 %rd12894, %rd4677, %rd4646; bra.uni $L__BB1_1931; $L__BB1_1930: cvt.u32.u64 %r4420, %rd4646; cvt.u32.u64 %r4421, %rd4677; rem.u32 %r4422, %r4421, %r4420; cvt.u64.u32 %rd12894, %r4422; $L__BB1_1931: add.u64 %rd12904, %SP, 560; shl.b64 %rd10413, %rd12894, 3; add.s64 %rd4687, %rd4647, %rd10413; ld.u32 %rd10414, [%rd4687]; ld.u32 %rd10415, [%rd4687+4]; bfi.b64 %rd10416, %rd10415, %rd10414, 32, 32; st.local.v2.u64 [%rd4605], {%rd4676, %rd10416}; mov.u64 %rd12909, 2; mov.u64 %rd12895, %rd4634; mov.u64 %rd12896, %rd4632; mov.u64 %rd12897, %rd4632; mov.u64 %rd12898, %rd4633; mov.u64 %rd12899, %rd4632; mov.u64 %rd12900, %rd4632; mov.u64 %rd12901, %rd4633; mov.u64 %rd12902, %rd4605; mov.u64 %rd12903, %rd4605; mov.u64 %rd12905, %rd4605; mov.u64 %rd12906, 
%rd4605; mov.u64 %rd12907, %rd12904; mov.u64 %rd12908, %rd4635; $L__BB1_1932: setp.eq.s64 %p2618, %rd12909, 0; @%p2618 bra $L__BB1_1935; add.s64 %rd12909, %rd12909, -1; add.s64 %rd10417, %rd12896, 8; setp.eq.s64 %p2619, %rd12899, %rd12895; selp.b64 %rd10418, %rd10417, %rd12899, %p2619; add.s64 %rd10419, %rd12897, 8; selp.b64 %rd10420, %rd10419, %rd12900, %p2619; add.s64 %rd10421, %rd12898, 8; selp.b64 %rd10422, %rd10421, %rd12901, %p2619; setp.eq.s64 %p2620, %rd12909, 0; add.s64 %rd10423, %rd10418, 4; add.s64 %rd10424, %rd10420, 4; add.s64 %rd10425, %rd10422, 4; selp.b64 %rd4704, %rd10418, %rd10423, %p2620; selp.b64 %rd12900, %rd10420, %rd10424, %p2620; selp.b64 %rd12901, %rd10422, %rd10425, %p2620; selp.b64 %rd12896, %rd10417, %rd12896, %p2619; selp.b64 %rd12897, %rd10419, %rd12897, %p2619; selp.b64 %rd12898, %rd10421, %rd12898, %p2619; add.s64 %rd10426, %rd12899, 8; selp.b64 %rd12895, %rd10426, %rd12895, %p2619; add.s64 %rd10427, %rd12905, 8; setp.eq.s64 %p2621, %rd12902, %rd12908; selp.b64 %rd10428, %rd10427, %rd12902, %p2621; add.s64 %rd10429, %rd12906, 8; selp.b64 %rd10430, %rd10429, %rd12903, %p2621; add.s64 %rd10431, %rd12907, 8; selp.b64 %rd10432, %rd10431, %rd12904, %p2621; selp.b64 %rd12905, %rd10427, %rd12905, %p2621; selp.b64 %rd12906, %rd10429, %rd12906, %p2621; selp.b64 %rd12907, %rd10431, %rd12907, %p2621; add.s64 %rd10433, %rd12902, 8; selp.b64 %rd12908, %rd10433, %rd12908, %p2621; add.s64 %rd10434, %rd10428, 4; add.s64 %rd10435, %rd10430, 4; add.s64 %rd10436, %rd10432, 4; selp.b64 %rd12902, %rd10428, %rd10434, %p2620; selp.b64 %rd12903, %rd10430, %rd10435, %p2620; selp.b64 %rd12904, %rd10432, %rd10436, %p2620; ld.local.f32 %f4818, [%rd10430]; ld.local.f32 %f4819, [%rd10420]; setp.eq.f32 %p2622, %f4819, %f4818; mov.u64 %rd12899, %rd4704; @%p2622 bra $L__BB1_1932; bra.uni $L__BB1_1934; $L__BB1_1935: ld.u32 %rd10437, [%rd4675]; ld.u32 %rd10438, [%rd4675+4]; bfi.b64 %rd10439, %rd10438, %rd10437, 32, 32; cvt.u32.u64 %r4423, %rd10439; mov.b32 %f4820, 
%r4423; shr.u64 %rd10440, %rd10439, 32; cvt.u32.u64 %r4424, %rd10440; mov.b32 %f4821, %r4424; ld.u32 %rd10441, [%rd4687]; ld.u32 %rd10442, [%rd4687+4]; bfi.b64 %rd10443, %rd10442, %rd10441, 32, 32; cvt.u32.u64 %r4425, %rd10443; shr.u64 %rd10444, %rd10443, 32; cvt.u32.u64 %r4426, %rd10444; mov.b32 %f4822, %r4425; sub.f32 %f5480, %f4822, %f4820; mov.b32 %f4823, %r4426; sub.f32 %f5481, %f4823, %f4821; bra.uni $L__BB1_1946; $L__BB1_1940: cvt.u32.u64 %r4427, %rd4646; cvt.u32.u64 %r4428, %rd4718; rem.u32 %r4429, %r4428, %r4427; cvt.u64.u32 %rd12910, %r4429; $L__BB1_1941: shl.b64 %rd10453, %rd12910, 3; add.s64 %rd10454, %rd4647, %rd10453; ld.u32 %rd10455, [%rd10454]; ld.u32 %rd10456, [%rd10454+4]; bfi.b64 %rd4729, %rd10456, %rd10455, 32, 32; st.local.v2.u64 [%rd4605], {%rd4719, %rd4729}; mov.u64 %rd12925, 2; mov.u64 %rd12911, %rd4632; mov.u64 %rd12912, %rd4628; mov.u64 %rd12913, %rd4628; mov.u64 %rd12914, %rd4631; mov.u64 %rd12915, %rd4628; mov.u64 %rd12916, %rd4628; mov.u64 %rd12917, %rd4631; mov.u64 %rd12918, %rd4636; mov.u64 %rd12919, %rd4636; mov.u64 %rd12920, %rd4637; mov.u64 %rd12921, %rd4636; mov.u64 %rd12922, %rd4636; mov.u64 %rd12923, %rd4637; mov.u64 %rd12924, %rd4638; $L__BB1_1942: setp.eq.s64 %p2626, %rd12925, 0; @%p2626 bra $L__BB1_1945; add.s64 %rd12925, %rd12925, -1; add.s64 %rd10457, %rd12912, 8; setp.eq.s64 %p2627, %rd12915, %rd12911; selp.b64 %rd10458, %rd10457, %rd12915, %p2627; add.s64 %rd10459, %rd12913, 8; selp.b64 %rd10460, %rd10459, %rd12916, %p2627; add.s64 %rd10461, %rd12914, 8; selp.b64 %rd10462, %rd10461, %rd12917, %p2627; setp.eq.s64 %p2628, %rd12925, 0; add.s64 %rd10463, %rd10458, 4; add.s64 %rd10464, %rd10460, 4; add.s64 %rd10465, %rd10462, 4; selp.b64 %rd4746, %rd10458, %rd10463, %p2628; selp.b64 %rd12916, %rd10460, %rd10464, %p2628; selp.b64 %rd12917, %rd10462, %rd10465, %p2628; selp.b64 %rd12912, %rd10457, %rd12912, %p2627; selp.b64 %rd12913, %rd10459, %rd12913, %p2627; selp.b64 %rd12914, %rd10461, %rd12914, %p2627; add.s64 %rd10466, 
%rd12915, 8; selp.b64 %rd12911, %rd10466, %rd12911, %p2627; add.s64 %rd10467, %rd12921, 8; setp.eq.s64 %p2629, %rd12918, %rd12924; selp.b64 %rd10468, %rd10467, %rd12918, %p2629; add.s64 %rd10469, %rd12922, 8; selp.b64 %rd10470, %rd10469, %rd12919, %p2629; add.s64 %rd10471, %rd12923, 8; selp.b64 %rd10472, %rd10471, %rd12920, %p2629; selp.b64 %rd12921, %rd10467, %rd12921, %p2629; selp.b64 %rd12922, %rd10469, %rd12922, %p2629; selp.b64 %rd12923, %rd10471, %rd12923, %p2629; add.s64 %rd10473, %rd12918, 8; selp.b64 %rd12924, %rd10473, %rd12924, %p2629; add.s64 %rd10474, %rd10468, 4; add.s64 %rd10475, %rd10470, 4; add.s64 %rd10476, %rd10472, 4; selp.b64 %rd12918, %rd10468, %rd10474, %p2628; selp.b64 %rd12919, %rd10470, %rd10475, %p2628; selp.b64 %rd12920, %rd10472, %rd10476, %p2628; ld.local.f32 %f4824, [%rd10470]; ld.local.f32 %f4825, [%rd10460]; setp.eq.f32 %p2630, %f4825, %f4824; mov.u64 %rd12915, %rd4746; @%p2630 bra $L__BB1_1942; bra.uni $L__BB1_1944; $L__BB1_1945: cvt.u32.u64 %r4430, %rd4719; mov.b32 %f4826, %r4430; shr.u64 %rd10477, %rd4719, 32; cvt.u32.u64 %r4431, %rd10477; mov.b32 %f4827, %r4431; shr.u64 %rd10478, %rd4729, 32; cvt.u32.u64 %r4432, %rd10478; cvt.u32.u64 %r4433, %rd4729; mov.b32 %f4828, %r4433; sub.f32 %f4829, %f4828, %f4826; mov.b32 %f4830, %r4432; sub.f32 %f4831, %f4830, %f4827; neg.f32 %f5480, %f4829; neg.f32 %f5481, %f4831; $L__BB1_1946: mul.f32 %f4832, %f1421, %f5481; fma.rn.f32 %f1428, %f1420, %f5480, %f4832; mul.f32 %f4833, %f5481, %f5481; fma.rn.f32 %f4834, %f5480, %f5480, %f4833; add.f32 %f4835, %f4834, 0f00000000; sqrt.rn.f32 %f4836, %f4835; mul.f32 %f4837, %f4836, 0f3A83126F; abs.f32 %f4838, %f1428; setp.gt.f32 %p2631, %f4838, %f4837; @%p2631 bra $L__BB1_1948; bra.uni $L__BB1_1947; $L__BB1_1948: setp.ge.f32 %p2958, %f1428, 0f00000000; bra.uni $L__BB1_1951; $L__BB1_1947: ld.local.u64 %rd10479, [%rd4609+8]; cvt.u32.u64 %r4434, %rd10479; mov.b32 %f4839, %r4434; shr.u64 %rd10480, %rd10479, 32; cvt.u32.u64 %r4435, %rd10480; mov.b32 %f4840, 
%r4435; sub.f32 %f4841, %f1208, %f4839; sub.f32 %f4842, %f1209, %f4840; mul.f32 %f4843, %f1421, %f4842; fma.rn.f32 %f4844, %f1420, %f4841, %f4843; setp.le.f32 %p2958, %f4844, 0f00000000; $L__BB1_1951: selp.u16 %rs900, 1, 0, %p2958; st.local.u8 [%rd4609+16], %rs900; $L__BB1_1952: setp.eq.s32 %p2959, %r1469, 2; ld.local.v2.u32 {%r5374, %r5375}, [%rd4609+8]; ld.local.u32 %r5376, [%rd4609+16]; $L__BB1_1954: mov.u64 %rd10488, 0; mov.u64 %rd12926, 2; mov.u64 %rd12927, %rd10488; @%p2959 bra $L__BB1_1956; setp.ne.s16 %p2632, %rs187, 0; cvt.u16.u32 %rs902, %r5376; selp.u16 %rs903, 1, 0, %p2632; xor.b16 %rs904, %rs902, %rs903; mov.b32 %f4851, %r5374; mov.b32 %f4852, %r5375; mul.f32 %f4853, %f1391, %f4851; mul.f32 %f4854, %f1390, %f4852; sub.f32 %f4855, %f4853, %f4854; mul.f32 %f4856, %f1390, %f4851; fma.rn.f32 %f4857, %f1391, %f4852, %f4856; add.f32 %f4858, %f1388, %f4855; mov.b32 %r4440, %f4858; add.f32 %f4859, %f1389, %f4857; mov.b32 %r4441, %f4859; cvt.u64.u32 %rd10489, %r4441; cvt.u64.u32 %rd10490, %r4440; cvt.u64.u16 %rd10491, %rs904; bfi.b64 %rd12927, %rd10489, %rd10490, 32, 32; and.b64 %rd10492, %rd10491, 255; mov.b64 {%r4442, %r4443}, %rd10492; mov.b32 {%rs905, %rs906}, %r4442; cvt.u64.u16 %rd12926, %rs905; $L__BB1_1956: or.b64 %rd10493, %rd10488, %rd10488; or.b64 %rd10494, %rd12926, %rd10488; or.b64 %rd10495, %rd10494, %rd10488; or.b64 %rd10496, %rd10493, %rd12927; mov.b64 {%r5435, %r5436}, %rd10496; mov.b64 {%r5437, %r4444}, %rd10495; $L__BB1_2124: mov.b32 {%rs208, %rs974}, %r5437; and.b16 %rs975, %rs208, 255; setp.eq.s16 %p2877, %rs975, 2; @%p2877 bra $L__BB1_2126; mov.b64 %rd10946, {%r5437, %r4688}; shr.u64 %rd10947, %rd10946, 8; and.b64 %rd10948, %rd10947, 16777215; cvt.u64.u16 %rd10949, %rs208; and.b64 %rd10950, %rd10949, 255; mov.b64 %rd10951, {%r5435, %r5436}; bfi.b64 %rd10952, %rd10948, %rd10950, 8, 56; mov.b64 {%r4689, %r4690}, %rd10952; mov.b32 {%rs976, %rs977}, %r4689; shr.u64 %rd10953, %rd10951, 32; cvt.u32.u64 %r4691, %rd10953; mov.b32 %f5156, %r5435; 
sub.f32 %f5157, %f5156, %f1208; mov.b32 %f5158, %r4691; sub.f32 %f5159, %f5158, %f1209; mul.f32 %f5160, %f5159, %f5159; fma.rn.f32 %f5161, %f5157, %f5157, %f5160; add.f32 %f5162, %f5161, 0f00000000; sqrt.rn.f32 %f5163, %f5162; and.b16 %rs978, %rs976, 1; setp.eq.b16 %p2878, %rs978, 1; selp.f32 %f5164, 0fBF800000, 0f3F800000, %p2878; mul.f32 %f5165, %f5164, %f5163; setp.ge.f32 %p2879, %f5165, %f1387; setp.le.f32 %p2880, %f5165, %f1387; selp.b16 %rs979, 1, 2, %p2880; setp.gtu.f32 %p2881, %f5165, %f1387; selp.b16 %rs980, -1, 0, %p2881; selp.b16 %rs981, %rs980, %rs979, %p2879; setp.eq.s16 %p2882, %rs981, 1; selp.f32 %f1387, %f5165, %f1387, %p2882; $L__BB1_2126: add.s64 %rd4641, %rd4641, 280; add.s64 %rd4642, %rd4642, 280; setp.ne.s64 %p2883, %rd4049, 0; add.s64 %rd4640, %rd4640, 280; @%p2883 bra $L__BB1_1896; $L__BB1_2127: setp.eq.s32 %p2884, %r1259, 0; @%p2884 bra $L__BB1_2129; ld.param.f32 %f5237, [grid_update_param_1]; sub.f32 %f5167, %f1207, %f1387; div.rn.f32 %f5168, %f5167, %f5237; div.rn.f32 %f5169, %f5168, 0f3DCCCCCD; mul.f32 %f4766, %f5169, 0f3F000000; $L__BB1_2129: mul.f32 %f5170, %f4766, %f4766; fma.rn.f32 %f5171, %f858, %f858, %f5170; add.f32 %f5172, %f5171, 0f00000000; sqrt.rn.f32 %f1569, %f5172; setp.le.f32 %p2885, %f1569, 0f3727C5AC; mov.u64 %rd10957, 0; mov.u64 %rd13067, %rd10957; mov.u64 %rd13068, %rd10957; mov.u64 %rd13069, %rd10957; @%p2885 bra $L__BB1_2131; div.rn.f32 %f5173, %f858, %f1569; mov.b32 %r4692, %f5173; div.rn.f32 %f5174, %f4766, %f1569; mov.b32 %r4693, %f5174; cvt.u64.u32 %rd10960, %r4693; cvt.u64.u32 %rd10961, %r4692; bfi.b64 %rd10962, %rd10960, %rd10961, 32, 32; shr.u64 %rd13068, %rd10962, 32; shl.b64 %rd13067, %rd10962, 32; mov.u64 %rd13069, 1; $L__BB1_2131: or.b64 %rd10963, %rd10957, %rd13068; or.b64 %rd10964, %rd13069, %rd13067; mov.b64 {%r4694, %r4695}, %rd10964; setp.eq.s32 %p2886, %r4694, 0; mov.b64 {%r4696, %r4697}, %rd10963; mov.b64 %rd10965, {%r4695, %r4696}; selp.b64 %rd10966, 0, %rd10965, %p2886; st.global.u64 [%rd13+36], 
%rd10966; ld.global.u64 %rd13070, [%rd13+20]; $L__BB1_2132: mov.b32 %r5439, %f4; mov.b32 %r5438, %f5; cvt.u16.u64 %rs982, %rd13070; shl.b16 %rs983, %rs982, 14; add.s16 %rs984, %rs983, -16384; shr.s16 %rs985, %rs984, 14; setp.lt.s16 %p2887, %rs985, 0; @%p2887 bra $L__BB1_2147; ld.param.u64 %rd11366, [grid_update_param_3]; ld.global.u64 %rd5188, [%rd13+28]; setp.ge.u64 %p2888, %rd5188, %rd11366; mul.lo.s64 %rd10967, %rd5188, 280; add.s64 %rd10968, %rd5233, %rd10967; setp.eq.s64 %p2889, %rd10968, 0; or.pred %p2890, %p2888, %p2889; @%p2890 bra $L__BB1_2149; cvta.to.global.u64 %rd10969, %rd5233; add.s64 %rd5189, %rd10969, %rd10967; ld.global.u16 %rs986, [%rd5189+272]; setp.eq.s16 %p2891, %rs986, 0; @%p2891 bra $L__BB1_2146; mov.b32 %r5439, %f4; mov.b32 %r5438, %f5; setp.eq.s16 %p2892, %rs986, 3; @%p2892 bra $L__BB1_2147; add.u64 %rd13078, %SPL, 32; ld.param.u64 %rd11333, [grid_update_param_1+8]; add.u64 %rd13080, %SP, 32; mul.lo.s64 %rd11331, %rd5249, 56; mov.u32 %r4716, %tid.x; mov.u32 %r4715, %ctaid.x; mul.wide.u32 %rd11330, %r4715, 16; cvt.u64.u32 %rd11329, %r4716; mov.u32 %r4714, %tid.y; mul.wide.u32 %rd11328, %r4714, 4; add.s64 %rd11327, %rd11329, %rd11330; add.s64 %rd11326, %rd11327, %rd11328; mul.lo.s64 %rd11325, %rd11326, 56; cvta.to.global.u64 %rd11324, %rd11333; add.s64 %rd11323, %rd11324, %rd11325; mov.u64 %rd10973, 0; mov.u64 %rd13085, 2; add.s64 %rd13072, %rd11323, 40; add.s64 %rd10981, %rd11333, %rd11325; add.s64 %rd13074, %rd10981, 40; st.local.u64 [%rd13078], %rd10973; add.s64 %rd13071, %rd11323, 48; add.s64 %rd13084, %rd13078, 8; mov.u64 %rd13073, %rd13072; mov.u64 %rd13075, %rd13072; mov.u64 %rd13076, %rd13072; mov.u64 %rd13077, %rd13074; mov.u64 %rd13079, %rd13078; mov.u64 %rd13081, %rd13078; mov.u64 %rd13082, %rd13078; mov.u64 %rd13083, %rd13080; $L__BB1_2137: setp.eq.s64 %p2893, %rd13085, 0; @%p2893 bra $L__BB1_2145; add.s64 %rd13085, %rd13085, -1; add.s64 %rd10982, %rd13072, 8; setp.eq.s64 %p2894, %rd13075, %rd13071; selp.b64 %rd10983, %rd10982, 
%rd13075, %p2894; add.s64 %rd10984, %rd13073, 8; selp.b64 %rd10985, %rd10984, %rd13076, %p2894; add.s64 %rd10986, %rd13074, 8; selp.b64 %rd10987, %rd10986, %rd13077, %p2894; setp.eq.s64 %p2895, %rd13085, 0; add.s64 %rd10988, %rd10983, 4; add.s64 %rd10989, %rd10985, 4; add.s64 %rd10990, %rd10987, 4; selp.b64 %rd5212, %rd10983, %rd10988, %p2895; selp.b64 %rd13076, %rd10985, %rd10989, %p2895; selp.b64 %rd13077, %rd10987, %rd10990, %p2895; selp.b64 %rd13072, %rd10982, %rd13072, %p2894; selp.b64 %rd13073, %rd10984, %rd13073, %p2894; selp.b64 %rd13074, %rd10986, %rd13074, %p2894; add.s64 %rd10991, %rd13075, 8; selp.b64 %rd13071, %rd10991, %rd13071, %p2894; add.s64 %rd10992, %rd13081, 8; setp.eq.s64 %p2896, %rd13078, %rd13084; selp.b64 %rd10993, %rd10992, %rd13078, %p2896; add.s64 %rd10994, %rd13082, 8; selp.b64 %rd10995, %rd10994, %rd13079, %p2896; add.s64 %rd10996, %rd13083, 8; selp.b64 %rd10997, %rd10996, %rd13080, %p2896; selp.b64 %rd13081, %rd10992, %rd13081, %p2896; selp.b64 %rd13082, %rd10994, %rd13082, %p2896; selp.b64 %rd13083, %rd10996, %rd13083, %p2896; add.s64 %rd10998, %rd13078, 8; selp.b64 %rd13084, %rd10998, %rd13084, %p2896; add.s64 %rd10999, %rd10993, 4; add.s64 %rd11000, %rd10995, 4; add.s64 %rd11001, %rd10997, 4; selp.b64 %rd13078, %rd10993, %rd10999, %p2895; selp.b64 %rd13079, %rd10995, %rd11000, %p2895; selp.b64 %rd13080, %rd10997, %rd11001, %p2895; ld.local.f32 %f5175, [%rd10995]; ld.global.f32 %f5176, [%rd10985]; setp.eq.f32 %p2897, %f5176, %f5175; mov.u64 %rd13075, %rd5212; @%p2897 bra $L__BB1_2137; mov.b32 %r5439, %f4; mov.b32 %r5438, %f5; ld.global.v2.f32 {%f5177, %f5178}, [%rd13+36]; ld.global.v2.f32 {%f5179, %f5180}, [%rd13+44]; mul.f32 %f5183, %f5180, %f5180; fma.rn.f32 %f1572, %f5179, %f5179, %f5183; mul.f32 %f5184, %f5, %f5178; fma.rn.f32 %f1573, %f4, %f5177, %f5184; setp.geu.f32 %p2898, %f1573, 0f00000000; @%p2898 bra $L__BB1_2147; ld.param.f32 %f5238, [grid_update_param_1]; setp.eq.s64 %p2899, %rd13070, 1; add.f32 %f5185, %f1572, 
0f00000000; sqrt.rn.f32 %f5186, %f5185; sub.f32 %f1574, %f5186, %f5238; setp.le.f32 %p2900, %f1574, 0f00000000; or.pred %p2901, %p2899, %p2900; @%p2901 bra $L__BB1_2143; bra.uni $L__BB1_2141; $L__BB1_2143: mul.f32 %f5195, %f5177, %f1573; sub.f32 %f1575, %f4, %f5195; mov.b32 %r5439, %f1575; mul.f32 %f5196, %f5178, %f1573; sub.f32 %f1576, %f5, %f5196; mov.b32 %r5438, %f1576; mul.f32 %f5197, %f1576, %f1576; fma.rn.f32 %f5198, %f1575, %f1575, %f5197; add.f32 %f5199, %f5198, 0f00000000; sqrt.rn.f32 %f1577, %f5199; setp.leu.f32 %p2903, %f1577, 0f2EDBE6FF; @%p2903 bra $L__BB1_2147; ld.global.f32 %f5200, [%rd5189+264]; fma.rn.f32 %f5201, %f1573, %f5200, %f1577; mov.f32 %f5202, 0f00000000; max.f32 %f5203, %f5201, %f5202; div.rn.f32 %f5204, %f1575, %f1577; mul.f32 %f5205, %f5204, %f5203; mov.b32 %r5439, %f5205; div.rn.f32 %f5206, %f1576, %f1577; mul.f32 %f5207, %f5206, %f5203; mov.b32 %r5438, %f5207; bra.uni $L__BB1_2147; $L__BB1_2145: mov.b32 %r5439, %f4; mov.b32 %r5438, %f5; bra.uni $L__BB1_2147; $L__BB1_2146: mov.b32 %r4724, %f4; mov.b32 %r4723, %f5; setp.eq.s64 %p2904, %rd13070, 1; selp.b32 %r5438, 0, %r4723, %p2904; selp.b32 %r5439, 0, %r4724, %p2904; $L__BB1_2147: st.global.u32 [%rd13], %r5439; st.global.u32 [%rd13+4], %r5438; ld.global.f32 %f5208, [%rd13+12]; setp.eq.f32 %p2905, %f5208, 0f00000000; rcp.rn.f32 %f5209, %f5208; selp.f32 %f5210, 0f00000000, %f5209, %p2905; ld.global.f32 %f5211, [%rd13+8]; mul.f32 %f5212, %f5211, %f5210; st.global.f32 [%rd13+8], %f5212; $L__BB1_2148: ret; $L__BB1_2141: mov.b32 %r5439, %f4; mov.b32 %r5438, %f5; ld.param.f32 %f5239, [grid_update_param_0]; mul.f32 %f5187, %f1573, %f5239; neg.f32 %f5188, %f5187; setp.geu.f32 %p2902, %f1574, %f5188; @%p2902 bra $L__BB1_2147; ld.param.f32 %f5240, [grid_update_param_0]; div.rn.f32 %f5189, %f1574, %f5240; add.f32 %f5190, %f1573, %f5189; mul.f32 %f5191, %f5177, %f5190; mul.f32 %f5192, %f5178, %f5190; sub.f32 %f5193, %f4, %f5191; mov.b32 %r5439, %f5193; sub.f32 %f5194, %f5, %f5192; mov.b32 %r5438, 
%f5194; bra.uni $L__BB1_2147; $L__BB1_657: trap; $L__BB1_1128: trap; $L__BB1_1601: trap; $L__BB1_2072: trap; $L__BB1_420: trap; $L__BB1_891: trap; $L__BB1_1364: trap; $L__BB1_1835: trap; $L__BB1_538: trap; $L__BB1_660: trap; $L__BB1_662: trap; $L__BB1_1009: trap; $L__BB1_1131: trap; $L__BB1_1133: trap; $L__BB1_1482: trap; $L__BB1_1604: trap; $L__BB1_1606: trap; $L__BB1_1953: trap; $L__BB1_2075: trap; $L__BB1_2077: trap; $L__BB1_301: trap; $L__BB1_423: trap; $L__BB1_425: trap; $L__BB1_772: trap; $L__BB1_894: trap; $L__BB1_896: trap; $L__BB1_1245: trap; $L__BB1_1367: trap; $L__BB1_1369: trap; $L__BB1_1716: trap; $L__BB1_1838: trap; $L__BB1_1840: trap; $L__BB1_320: trap; $L__BB1_322: trap; $L__BB1_324: trap; $L__BB1_326: trap; $L__BB1_791: trap; $L__BB1_793: trap; $L__BB1_795: trap; $L__BB1_797: trap; $L__BB1_1264: trap; $L__BB1_1266: trap; $L__BB1_1268: trap; $L__BB1_1270: trap; $L__BB1_1735: trap; $L__BB1_1737: trap; $L__BB1_1739: trap; $L__BB1_1741: trap; $L__BB1_557: trap; $L__BB1_559: trap; $L__BB1_561: trap; $L__BB1_563: trap; $L__BB1_1028: trap; $L__BB1_1030: trap; $L__BB1_1032: trap; $L__BB1_1034: trap; $L__BB1_1501: trap; $L__BB1_1503: trap; $L__BB1_1505: trap; $L__BB1_1507: trap; $L__BB1_1972: trap; $L__BB1_1974: trap; $L__BB1_1976: trap; $L__BB1_1978: trap; $L__BB1_519: trap; $L__BB1_529: trap; $L__BB1_990: trap; $L__BB1_1000: trap; $L__BB1_1463: trap; $L__BB1_1473: trap; $L__BB1_1934: trap; $L__BB1_1944: trap; $L__BB1_282: trap; $L__BB1_292: trap; $L__BB1_753: trap; $L__BB1_763: trap; $L__BB1_1226: trap; $L__BB1_1236: trap; $L__BB1_1697: trap; $L__BB1_1707: trap; $L__BB1_2149: trap; $L__BB1_548: trap; $L__BB1_1019: trap; $L__BB1_1492: trap; $L__BB1_1963: trap; $L__BB1_64: trap; $L__BB1_183: trap; $L__BB1_186: trap; $L__BB1_188: trap; $L__BB1_208: trap; $L__BB1_60: trap; $L__BB1_48: trap; $L__BB1_74: trap; $L__BB1_83: trap; $L__BB1_85: trap; $L__BB1_87: trap; $L__BB1_89: trap; $L__BB1_171: { // callseq 0, 0 .reg .b32 temp_param_reg; call.uni 
_ZN4core6result13unwrap_failed17h02aadeb87602f26eE,
    (
    );
    } // callseq 0
// Out-of-line failure blocks closing the preceding kernel body: every label
// below either executes `trap` or calls
// _ZN4core6result13unwrap_failed17h02aadeb87602f26eE (core::result::unwrap_failed).
$L__BB1_55:
    trap;
$L__BB1_45:
    trap;
$L__BB1_645:
    { // callseq 2, 0
    .reg .b32 temp_param_reg;
    call.uni
    _ZN4core6result13unwrap_failed17h02aadeb87602f26eE,
    (
    );
    } // callseq 2
$L__BB1_1116:
    { // callseq 4, 0
    .reg .b32 temp_param_reg;
    call.uni
    _ZN4core6result13unwrap_failed17h02aadeb87602f26eE,
    (
    );
    } // callseq 4
$L__BB1_1589:
    { // callseq 6, 0
    .reg .b32 temp_param_reg;
    call.uni
    _ZN4core6result13unwrap_failed17h02aadeb87602f26eE,
    (
    );
    } // callseq 6
$L__BB1_2060:
    { // callseq 8, 0
    .reg .b32 temp_param_reg;
    call.uni
    _ZN4core6result13unwrap_failed17h02aadeb87602f26eE,
    (
    );
    } // callseq 8
$L__BB1_445:
    trap;
$L__BB1_297:
    trap;
$L__BB1_285:
    trap;
$L__BB1_311:
    trap;
$L__BB1_408:
    { // callseq 1, 0
    .reg .b32 temp_param_reg;
    call.uni
    _ZN4core6result13unwrap_failed17h02aadeb87602f26eE,
    (
    );
    } // callseq 1
$L__BB1_916:
    trap;
$L__BB1_768:
    trap;
$L__BB1_756:
    trap;
$L__BB1_782:
    trap;
$L__BB1_879:
    { // callseq 3, 0
    .reg .b32 temp_param_reg;
    call.uni
    _ZN4core6result13unwrap_failed17h02aadeb87602f26eE,
    (
    );
    } // callseq 3
$L__BB1_1389:
    trap;
$L__BB1_1241:
    trap;
$L__BB1_1229:
    trap;
$L__BB1_1255:
    trap;
$L__BB1_1352:
    { // callseq 5, 0
    .reg .b32 temp_param_reg;
    call.uni
    _ZN4core6result13unwrap_failed17h02aadeb87602f26eE,
    (
    );
    } // callseq 5
$L__BB1_1860:
    trap;
$L__BB1_1712:
    trap;
$L__BB1_1700:
    trap;
$L__BB1_1726:
    trap;
$L__BB1_1823:
    { // callseq 7, 0
    .reg .b32 temp_param_reg;
    call.uni
    _ZN4core6result13unwrap_failed17h02aadeb87602f26eE,
    (
    );
    } // callseq 7
$L__BB1_682:
    trap;
$L__BB1_534:
    trap;
$L__BB1_522:
    trap;
$L__BB1_1153:
    trap;
$L__BB1_1005:
    trap;
$L__BB1_993:
    trap;
$L__BB1_1626:
    trap;
$L__BB1_1478:
    trap;
$L__BB1_1466:
    trap;
$L__BB1_2097:
    trap;
$L__BB1_1949:
    trap;
$L__BB1_1937:
    trap;

}

// ---------------------------------------------------------------------------
// reset_hashmap
//
// Each thread computes one flat index `i` from the full 3-D grid/block
// coordinates and, if `i` is below the bound, overwrites one 16-byte record:
//     record[i] + 0 : u64 <- 0xFFFFFFFFFFFFFFFF  (mov.u64 -1)
//     record[i] + 8 : u32 <- 0
// Bytes 12..15 of the record are not written by this kernel.
//
// Parameter reset_hashmap_param_0 is a single 16-byte by-value aggregate:
//     +0 : u64  base address of the record array (converted with cvta.to.global)
//     +8 : u32  exclusive upper bound on `i`
//
// NOTE(review): presumably -1 is the "empty key" sentinel and the u32 at +8 is
// the associated value, with the bound being the table capacity - confirm
// against the Rust source this PTX was generated from.
// ---------------------------------------------------------------------------
    // .globl reset_hashmap
.visible .entry reset_hashmap(
    .param .align 8 .b8 reset_hashmap_param_0[16]
)
{
    .reg .pred %p<2>;
    .reg .b32 %r<25>;
    .reg .b64 %rd<11>;

    ld.param.u32 %r2, [reset_hashmap_param_0+8];    // %r2  = bound
    ld.param.u64 %rd1, [reset_hashmap_param_0];     // %rd1 = base address
    mov.u32 %r3, %ntid.z;
    mov.u32 %r4, %ntid.y;
    mov.u32 %r5, %ntid.x;
    mov.b64 %rd2, {%r5, %r4};                       // pack: low word = ntid.x, high word = ntid.y
    // Flat block index: (ctaid.z * nctaid.y + ctaid.y) * nctaid.x + ctaid.x
    mov.u32 %r6, %ctaid.z;
    mov.u32 %r7, %nctaid.y;
    mov.u32 %r8, %ctaid.y;
    mad.lo.s32 %r9, %r6, %r7, %r8;
    mov.u32 %r10, %nctaid.x;
    mov.u32 %r11, %ctaid.x;
    mad.lo.s32 %r12, %r9, %r10, %r11;
    // Re-pack and unpack the block dimensions; the net effect is
    // %r13 = ntid.x, %r14 = ntid.y, %r15 = ntid.z (%r16 receives 0 and is never read).
    and.b64 %rd3, %rd2, 4294967295;
    cvt.u64.u32 %rd4, %r4;
    bfi.b64 %rd5, %rd4, %rd3, 32, 32;
    cvt.u64.u32 %rd6, %r3;
    mov.b64 {%r13, %r14}, %rd5;
    mov.b64 {%r15, %r16}, %rd6;
    mul.lo.s32 %r17, %r13, %r12;                    // block index * ntid.x
    mul.lo.s32 %r18, %r17, %r14;                    // ... * ntid.y
    // Flat thread index inside the block: (tid.z * ntid.y + tid.y) * ntid.x + tid.x
    mov.u32 %r19, %tid.z;
    mov.u32 %r20, %tid.y;
    mad.lo.s32 %r21, %r19, %r4, %r20;
    mov.u32 %r22, %tid.x;
    mad.lo.s32 %r23, %r21, %r5, %r22;
    // i = block index * (ntid.x * ntid.y * ntid.z) + thread index.
    // All of this is 32-bit mul.lo/mad.lo arithmetic, so only the low 32 bits are kept.
    mad.lo.s32 %r1, %r18, %r15, %r23;
    setp.ge.u32 %p1, %r1, %r2;                      // unsigned compare: i >= bound -> nothing to do
    @%p1 bra $L__BB2_2;

    cvta.to.global.u64 %rd7, %rd1;
    mul.wide.u32 %rd8, %r1, 16;                     // 16-byte record stride, widened to 64 bits
    add.s64 %rd9, %rd7, %rd8;
    mov.u64 %rd10, -1;
    st.global.u64 [%rd9], %rd10;                    // record + 0 <- all ones
    mov.u32 %r24, 0;
    st.global.u32 [%rd9+8], %r24;                   // record + 8 <- 0

$L__BB2_2:
    ret;

}

// ---------------------------------------------------------------------------
// add_data_grp
//
// For i = ntid.x * ctaid.x + tid.x, when i < param_1 (unsigned compare):
//     data[i] = data[i] + grp[ctaid.x]
// Only the x components of the thread/block coordinates are read.
//
//   add_data_grp_param_0 (u64): address of the u32 array updated in place (data)
//   add_data_grp_param_1 (u32): exclusive upper bound on i
//   add_data_grp_param_2 (u64): address of a u32 array indexed by ctaid.x (grp)
//
// NOTE(review): presumably the "add per-block offset" pass of a blocked
// prefix sum - confirm against the caller.
// ---------------------------------------------------------------------------
    // .globl add_data_grp
.visible .entry add_data_grp(
    .param .u64 add_data_grp_param_0,
    .param .u32 add_data_grp_param_1,
    .param .u64 add_data_grp_param_2
)
{
    .reg .pred %p<2>;
    .reg .b32 %r<9>;
    .reg .b64 %rd<9>;

    ld.param.u64 %rd1, [add_data_grp_param_0];
    ld.param.u32 %r3, [add_data_grp_param_1];
    ld.param.u64 %rd2, [add_data_grp_param_2];
    mov.u32 %r4, %ntid.x;
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r5, %tid.x;
    mad.lo.s32 %r2, %r4, %r1, %r5;                  // i = ntid.x * ctaid.x + tid.x (low 32 bits)
    setp.ge.u32 %p1, %r2, %r3;                      // i >= bound -> skip the update
    @%p1 bra $L__BB3_2;

    cvta.to.global.u64 %rd3, %rd1;
    mul.wide.u32 %rd4, %r2, 4;                      // 4-byte elements
    add.s64 %rd5, %rd3, %rd4;                       // &data[i]
    cvta.to.global.u64 %rd6, %rd2;
    mul.wide.u32 %rd7, %r1, 4;
    add.s64 %rd8, %rd6, %rd7;                       // &grp[ctaid.x]
    ld.global.u32 %r6, [%rd5];
    ld.global.u32 %r7, [%rd8];
    add.s32 %r8, %r6, %r7;
    st.global.u32 [%rd5], %r8;                      // plain load/add/store, not an atomic

$L__BB3_2:
    ret;

}
    // .globl prefix_sum_512
.visible .entry prefix_sum_512(
    .param .u64 prefix_sum_512_param_0,
    .param .u32 prefix_sum_512_param_1,
    .param .u64 prefix_sum_512_param_2
)
{
    .reg .pred %p<12>;
    .reg .b32 %r<22>;
    .reg .b64 %rd<63>;
    // demoted variable
    .shared .align 4 .b8
_ZN20sparkl2d_kernels_ptx4cuda10prefix_sum14prefix_sum_51212shared_array6SHARED17hd8a2402217f77aa1E[2048];
	// The 2048-byte shared array above is used as 512 u32 slots, one per tid.x.
	// NOTE(review): nothing here checks tid.x < 512; presumably the launch
	// guarantees it - confirm against the host code.

	ld.param.u64 %rd20, [prefix_sum_512_param_0];
	ld.param.u32 %r5, [prefix_sum_512_param_1];
	ld.param.u64 %rd21, [prefix_sum_512_param_2];
	mov.u32 %r1, %ctaid.x;
	// %r2 = first element index of this block (ctaid.x * 512).
	shl.b32 %r2, %r1, 9;
	// Whole block exits if its window starts at or past the element count.
	setp.ge.u32 %p1, %r2, %r5;
	@%p1 bra $L__BB4_17;
	mov.u32 %r7, %tid.x;
	cvt.u64.u32 %rd22, %r5;
	cvt.u64.u32 %rd1, %r1;
	mul.wide.u32 %rd23, %r1, 512;
	// %rd24 = elements remaining from the start of this block.
	sub.s64 %rd24, %rd22, %rd23;
	// %rd29 = 1 if remaining < 2, else (all-ones >> clz(remaining - 1)) + 1,
	// i.e. the smallest power of two >= remaining.
	setp.lt.u64 %p2, %rd24, 2;
	add.s64 %rd25, %rd24, -1;
	mov.u64 %rd26, -1;
	clz.b64 %r8, %rd25;
	shr.u64 %rd27, %rd26, %r8;
	add.s64 %rd28, %rd27, 1;
	selp.b64 %rd29, 1, %rd28, %p2;
	// %rd2 = min(%rd29, 512); %rd3 = max(%rd2, 1) is the tree width n.
	min.u64 %rd2, %rd29, 512;
	max.u64 %rd3, %rd2, 1;
	add.s32 %r9, %r2, %r7;
	cvt.u64.u32 %rd4, %r9;
	cvt.u64.u32 %rd5, %r7;
	setp.ge.u32 %p3, %r9, %r5;
	cvta.to.global.u64 %rd30, %rd20;
	mul.wide.u32 %rd31, %r9, 4;
	add.s64 %rd6, %rd30, %rd31;
	// Out-of-range threads contribute 0 instead of loading.
	mov.u32 %r21, 0;
	@%p3 bra $L__BB4_3;
	ld.global.u32 %r21, [%rd6];
$L__BB4_3:
	shl.b64 %rd32, %rd5, 2;
	mov.u64 %rd33, _ZN20sparkl2d_kernels_ptx4cuda10prefix_sum14prefix_sum_51212shared_array6SHARED17hd8a2402217f77aa1E;
	add.s64 %rd7, %rd33, %rd32;
	st.shared.u32 [%rd7], %r21;
	// %rd62 = n / 2; skip the first tree pass when n == 1.
	shr.u64 %rd62, %rd3, 1;
	setp.eq.s64 %p4, %rd62, 0;
	@%p4 bra $L__BB4_8;
	shl.b64 %rd9, %rd5, 1;
	mov.u64 %rd60, 1;
	or.b64 %rd10, %rd9, 1;
	mov.u64 %rd59, %rd62;
	// First pass: %rd59 = d (n/2, n/4, ..., 1), %rd60 = offset (1, 2, 4, ...).
	// Threads with tid < d do
	//   s[offset*(2*tid+2) - 1] += s[offset*(2*tid+1) - 1]
$L__BB4_5:
	bar.sync 0;
	setp.le.u64 %p5, %rd59, %rd5;
	@%p5 bra $L__BB4_7;
	mul.lo.s64 %rd35, %rd60, %rd10;
	add.s64 %rd36, %rd35, %rd60;
	shl.b64 %rd37, %rd36, 2;
	add.s64 %rd39, %rd33, %rd37;
	mul.lo.s64 %rd40, %rd60, %rd9;
	add.s64 %rd41, %rd40, %rd60;
	shl.b64 %rd42, %rd41, 2;
	add.s64 %rd43, %rd33, %rd42;
	ld.shared.u32 %r10, [%rd39+-4];
	ld.shared.u32 %r11, [%rd43+-4];
	add.s32 %r12, %r10, %r11;
	st.shared.u32 [%rd39+-4], %r12;
$L__BB4_7:
	shr.u64 %rd59, %rd59, 1;
	shl.b64 %rd60, %rd60, 1;
	setp.ne.s64 %p6, %rd59, 0;
	@%p6 bra $L__BB4_5;
$L__BB4_8:
	// Thread 0 only: publish s[n-1] to param_2[ctaid.x], then clear s[n-1].
	setp.ne.s32 %p7, %r7, 0;
	@%p7 bra $L__BB4_10;
	shl.b64 %rd44, %rd3, 2;
	add.s64 %rd46, %rd33, %rd44;
	cvta.to.global.u64 %rd47, %rd21;
	shl.b64 %rd48, %rd1, 2;
	add.s64 %rd49, %rd47, %rd48;
	ld.shared.u32 %r14, [%rd46+-4];
	st.global.u32 [%rd49], %r14;
	mov.u32 %r15, 0;
	st.shared.u32 [%rd46+-4], %r15;
$L__BB4_10:
	setp.lt.u64 %p8, %rd2, 2;
	bar.sync 0;
	@%p8 bra $L__BB4_15;
	shl.b64 %rd15, %rd5, 1;
	mov.u64 %rd61, 1;
	// Second pass: %rd61 = d (1, 2, 4, ... < n), %rd62 = offset (n/2, n/4, ...).
	// Threads with tid < d, with a = offset*(2*tid+1) - 1 and b = a + offset:
	//   t = s[a]; s[a] = s[b]; s[b] = s[b] + t
$L__BB4_12:
	setp.le.u64 %p9, %rd61, %rd5;
	@%p9 bra $L__BB4_14;
	mul.lo.s64 %rd51, %rd62, %rd15;
	add.s64 %rd52, %rd51, %rd62;
	shl.b64 %rd53, %rd52, 2;
	add.s64 %rd55, %rd33, %rd53;
	add.s64 %rd56, %rd55, -4;
	ld.shared.u32 %r16, [%rd55+-4];
	shl.b64 %rd57, %rd62, 2;
	add.s64 %rd58, %rd56, %rd57;
	ld.shared.u32 %r17, [%rd58];
	st.shared.u32 [%rd55+-4], %r17;
	add.s32 %r18, %r17, %r16;
	st.shared.u32 [%rd58], %r18;
$L__BB4_14:
	shl.b64 %rd61, %rd61, 1;
	shr.u64 %rd62, %rd62, 1;
	setp.lt.u64 %p10, %rd61, %rd3;
	bar.sync 0;
	@%p10 bra $L__BB4_12;
$L__BB4_15:
	// In-range threads write their shared slot back over the input element.
	cvt.u32.u64 %r19, %rd4;
	setp.ge.u32 %p11, %r19, %r5;
	@%p11 bra $L__BB4_17;
	ld.shared.u32 %r20, [%rd7];
	st.global.u32 [%rd6], %r20;
$L__BB4_17:
	ret;
}
	// .globl reset_grid
// ---------------------------------------------------------------------------
// reset_grid: zeroes parts of one 56-byte record per thread.
//   param_0 (72 bytes): +8  = u64 base address of the records,
//                       +64 = u64 record count used as the bound.
// Record index = ctaid.x * 16 + tid.y * 4 + tid.x; threads whose index is
// >= the bound return. Bytes 0..31 and 40..55 of the record are set to zero.
// NOTE(review): bytes 32..39 are never written here - presumably left
// untouched on purpose; confirm against the record layout in the source crate.
// ---------------------------------------------------------------------------
.visible .entry reset_grid(
	.param .align 8 .b8 reset_grid_param_0[72]
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<2>;
	.reg .b32 %r<7>;
	.reg .b64 %rd<17>;

	ld.param.u64 %rd8, [reset_grid_param_0+64];
	ld.param.u64 %rd2, [reset_grid_param_0+8];
	mov.u32 %r3, %tid.y;
	mov.u32 %r4, %tid.x;
	mov.u32 %r5, %ctaid.x;
	mul.wide.u32 %rd9, %r5, 16;
	cvt.u64.u32 %rd10, %r4;
	add.s64 %rd11, %rd10, %rd9;
	mul.wide.u32 %rd12, %r3, 4;
	add.s64 %rd1, %rd11, %rd12;
	setp.le.u64 %p1, %rd8, %rd1;
	@%p1 bra $L__BB5_2;
	mul.lo.s64 %rd13, %rd1, 56;
	mov.u64 %rd14, 0;
	cvta.to.global.u64 %rd15, %rd2;
	add.s64 %rd16, %rd15, %rd13;
	mov.u32 %r6, 0;
	// NOTE(review): 32-bit store fed from the 64-bit register %rd14. PTX allows
	// a wider source register on st (it is truncated); the value is 0 anyway.
	st.global.u32 [%rd16+8], %rd14;
	st.global.u64 [%rd16], %rd14;
	st.global.u32 [%rd16+12], %r6;
	st.global.u64 [%rd16+16], %rd14;
	st.global.u64 [%rd16+24], %rd14;
	st.global.u64 [%rd16+40], %rd14;
	st.global.u64 [%rd16+48], %rd14;
$L__BB5_2:
	ret;
}
	// .globl copy_grid_projection_data
// copy_grid_projection_data: takes two 72-byte by-value parameters.
.visible .entry copy_grid_projection_data(
	.param .align 8 .b8 copy_grid_projection_data_param_0[72],
	.param .align 8 .b8
copy_grid_projection_data_param_1[72]
)
{
	// -----------------------------------------------------------------------
	// Fields read from the two 72-byte parameters:
	//   param_0 +8  : u64 base of 56-byte source records
	//   param_0 +32 : u64 base of 16-byte lookup entries {u64 key, u32 value}
	//   param_0 +40 : u32 lookup capacity (capacity - 1 is used as an AND mask)
	//   param_0 +64 : u64 source record count (bound)
	//   param_1 +8  : u64 base of 56-byte destination records
	//   param_1 +16 : u64 base of 24-byte headers; u64 key at +0
	//   param_1 +64 : u64 destination record count (bound)
	// One block per header (ctaid.x). The block's key is looked up in the
	// param_0 table; on a hit, each thread copies selected fields from the
	// source record (value*16 + local) to the destination record
	// (ctaid.x*16 + local), where local = tid.y*4 + tid.x.
	// -----------------------------------------------------------------------
	.reg .pred %p<8>;
	.reg .f32 %f<3>;
	.reg .b32 %r<11>;
	.reg .b64 %rd<72>;

	ld.param.u64 %rd31, [copy_grid_projection_data_param_1+64];
	ld.param.u64 %rd26, [copy_grid_projection_data_param_1+16];
	ld.param.u64 %rd25, [copy_grid_projection_data_param_1+8];
	ld.param.u64 %rd24, [copy_grid_projection_data_param_0+64];
	ld.param.u32 %r2, [copy_grid_projection_data_param_0+40];
	ld.param.u64 %rd21, [copy_grid_projection_data_param_0+32];
	ld.param.u64 %rd18, [copy_grid_projection_data_param_0+8];
	cvta.to.global.u64 %rd1, %rd21;
	cvta.to.global.u64 %rd32, %rd26;
	mov.u32 %r5, %ctaid.x;
	mul.wide.u32 %rd33, %r5, 24;
	add.s64 %rd34, %rd32, %rd33;
	// %rd2 = key of this block's header.
	ld.global.u64 %rd2, [%rd34];
	// 64-bit shift-xor / multiply mix of the key
	// (multipliers are 0x85EBCA6B and 0xC2B2AE35).
	shr.u64 %rd35, %rd2, 16;
	xor.b64 %rd36, %rd35, %rd2;
	mul.lo.s64 %rd37, %rd36, 2246822507;
	shr.u64 %rd38, %rd37, 13;
	xor.b64 %rd39, %rd38, %rd37;
	mul.lo.s64 %rd40, %rd39, 3266489909;
	shr.u64 %rd41, %rd40, 16;
	xor.b64 %rd42, %rd41, %rd40;
	cvt.u64.u32 %rd43, %r2;
	add.s64 %rd3, %rd43, -1;
	// %rd68 = slot index = mix & (capacity - 1).
	and.b64 %rd68, %rd42, %rd3;
	shl.b64 %rd44, %rd68, 4;
	add.s64 %rd45, %rd1, %rd44;
	ld.global.u64 %rd5, [%rd45];
	setp.eq.s64 %p1, %rd5, %rd2;
	@%p1 bra $L__BB6_5;
	// A stored key of -1 ends the search with no match.
	setp.eq.s64 %p2, %rd5, -1;
	@%p2 bra $L__BB6_10;
	// Linear probing: advance one slot (masked) until the key or -1 is found.
	// NOTE(review): there is no iteration bound; the loop only ends on a key
	// match or a -1 slot.
$L__BB6_3:
	add.s64 %rd46, %rd68, 1;
	and.b64 %rd68, %rd46, %rd3;
	shl.b64 %rd47, %rd68, 4;
	add.s64 %rd48, %rd1, %rd47;
	ld.global.u64 %rd8, [%rd48];
	setp.eq.s64 %p3, %rd8, %rd2;
	@%p3 bra $L__BB6_5;
	setp.eq.s64 %p4, %rd8, -1;
	@%p4 bra $L__BB6_10;
	bra.uni $L__BB6_3;
$L__BB6_5:
	shl.b64 %rd51, %rd68, 4;
	add.s64 %rd52, %rd1, %rd51;
	mul.wide.u32 %rd53, %r5, 16;
	mov.u32 %r7, %tid.y;
	mov.u32 %r8, %tid.x;
	mov.u64 %rd70, 0;
	cvt.u64.u32 %rd54, %r8;
	mul.wide.u32 %rd55, %r7, 4;
	// %rd56 = local = tid.y*4 + tid.x; %rd10 = destination record index.
	add.s64 %rd56, %rd55, %rd54;
	add.s64 %rd10, %rd56, %rd53;
	// %r9 = u32 value stored at +8 of the matching lookup entry.
	ld.global.u32 %r9, [%rd52+8];
	mul.wide.u32 %rd57, %r9, 16;
	// %rd58 = source record index.
	add.s64 %rd58, %rd57, %rd56;
	setp.le.u64 %p5, %rd24, %rd58;
	cvta.to.global.u64 %rd59, %rd18;
	mul.lo.s64 %rd60, %rd58, 56;
	add.s64 %rd11, %rd59, %rd60;
	add.s64 %rd12, %rd18, %rd60;
	// %rd70 (global) / %rd71 (generic) source pointers stay 0 when the source
	// index is out of range.
	mov.u64 %rd71, %rd70;
	@%p5 bra $L__BB6_7;
	mov.u64 %rd70, %rd11;
	mov.u64 %rd71, %rd12;
$L__BB6_7:
	setp.le.u64 %p6, %rd31, %rd10;
	@%p6 bra $L__BB6_10;
	setp.eq.s64 %p7, %rd71, 0;
	@%p7 bra $L__BB6_10;
	cvta.to.global.u64 %rd61, %rd25;
	// The u32 at source +0 is written to destination +20 (different offsets).
	ld.global.u32 %r10, [%rd70];
	mul.lo.s64 %rd62, %rd10, 56;
	add.s64 %rd63, %rd61, %rd62;
	st.global.u32 [%rd63+20], %r10;
	// Bytes 24..55 are copied to the same offsets.
	ld.global.u64 %rd64, [%rd70+24];
	ld.global.u64 %rd65, [%rd70+32];
	st.global.u64 [%rd63+24], %rd64;
	st.global.u64 [%rd63+32], %rd65;
	ld.global.u64 %rd66, [%rd70+40];
	st.global.u64 [%rd63+40], %rd66;
	ld.global.u64 %rd67, [%rd70+48];
	st.global.u64 [%rd63+48], %rd67;
$L__BB6_10:
	ret;
}
	// .globl touch_particle_blocks
// ---------------------------------------------------------------------------
// touch_particle_blocks: one thread per 8-byte element of param_0.
//   param_0: u64 address of 8-byte elements, each read as two f32 values
//   param_1: u32 element count
//   param_2 (72 bytes): +0 f32 divisor, +16 u64 base of 24-byte headers,
//                       +24 u64 address of a u32 counter,
//                       +32 u64 base of 16-byte lookup entries,
//                       +40 u32 lookup capacity
// Each thread derives four 64-bit keys from its element and inserts each
// into the lookup table with an atomic compare-and-swap.
// ---------------------------------------------------------------------------
.visible .entry touch_particle_blocks(
	.param .u64 touch_particle_blocks_param_0,
	.param .u32 touch_particle_blocks_param_1,
	.param .align 8 .b8 touch_particle_blocks_param_2[72]
)
{
	.local .align 8 .b8 __local_depot7[48];
	.reg .b64 %SP;
	.reg .b64 %SPL;
	.reg .pred %p<12>;
	.reg .f32 %f<14>;
	.reg .b32 %r<43>;
	.reg .b64 %rd<84>;

	mov.u64 %SPL, __local_depot7;
	ld.param.u64 %rd14, [touch_particle_blocks_param_0];
	ld.param.u32 %r8, [touch_particle_blocks_param_1];
	ld.param.u32 %r7, [touch_particle_blocks_param_2+40];
	ld.param.u64 %rd18, [touch_particle_blocks_param_2+32];
	ld.param.u64 %rd17, [touch_particle_blocks_param_2+24];
	ld.param.u64 %rd16, [touch_particle_blocks_param_2+16];
	ld.param.f32 %f1, [touch_particle_blocks_param_2];
	// Linear thread id over the 3-D grid/block (continues below).
	mov.u32 %r9, %ntid.z;
	mov.u32 %r10, %ntid.y;
	mov.u32 %r11, %ntid.x;
	mov.b64 %rd22, {%r11, %r10};
	mov.u32 %r12, %ctaid.z;
	mov.u32 %r13, %nctaid.y;
	mov.u32 %r14, %ctaid.y;
	mad.lo.s32 %r15, %r12, %r13, %r14;
	mov.u32 %r16, %nctaid.x;
	mov.u32 %r17, %ctaid.x;
	mad.lo.s32 %r18, %r15, %r16, %r17;
	and.b64 %rd23, %rd22, 4294967295;
	cvt.u64.u32 %rd24, %r10;
	bfi.b64 %rd25, %rd24, %rd23, 32, 32;
	cvt.u64.u32 %rd26, %r9;
	mov.b64 {%r19, %r20}, %rd25;
	mov.b64 {%r21, %r22}, %rd26;
	mul.lo.s32 %r23, %r19, %r18;
	mul.lo.s32 %r24, %r23, %r20;
	mov.u32 %r25, %tid.z;
	mov.u32 %r26, %tid.y;
	mad.lo.s32 %r27, %r25, %r10, %r26;
	mov.u32 %r28, %tid.x;
	mad.lo.s32 %r29, %r27, %r11, %r28;
	// %r1 = linear thread id; threads at or beyond the element count exit.
	mad.lo.s32 %r1, %r24, %r21, %r29;
	setp.ge.u32 %p1, %r1, %r8;
	@%p1 bra $L__BB7_11;
	cvta.to.global.u64 %rd27, %rd14;
	mul.wide.u32 %rd28, %r1, 8;
	add.s64 %rd29, %rd27, %rd28;
	// Load the two 32-bit halves of the element and pack them into %rd32.
	ld.global.u32 %rd30, [%rd29];
	ld.global.u32 %rd31, [%rd29+4];
	bfi.b64 %rd32, %rd31, %rd30, 32, 32;
	mov.u64 %rd82, 0;
	cvt.u32.u64 %r30, %rd32;
	mov.b32 %f2, %r30;
	// %f3 / %f5 = each component divided by the f32 at param_2 + 0.
	div.rn.f32 %f3, %f2, %f1;
	shr.u64 %rd34, %rd32, 32;
	cvt.u32.u64 %r31, %rd34;
	mov.b32 %f4, %r31;
	div.rn.f32 %f5, %f4, %f1;
	// Round half away from zero: add 0.5 carrying the operand's sign
	// (0x3F000000 | sign bit) with round-toward-zero, then truncate.
	mov.b32 %r32, %f3;
	and.b32 %r33, %r32, -2147483648;
	or.b32 %r34, %r33, 1056964608;
	mov.b32 %f6, %r34;
	add.rz.f32 %f7, %f3, %f6;
	cvt.rzi.f32.f32 %f8, %f7;
	// Float -> s64 conversion: %p2 flags values above 0f5EFFFFFF, the max
	// clamps the low side to 0fDF000000, %p3 is false for NaN.
	setp.gt.f32 %p2, %f8, 0f5EFFFFFF;
	max.f32 %f9, %f8, 0fDF000000;
	cvt.rzi.s64.f32 %rd35, %f9;
	setp.num.f32 %p3, %f8, %f8;
	mov.b32 %r35, %f5;
	and.b32 %r36, %r35, -2147483648;
	or.b32 %r37, %r36, 1056964608;
	mov.b32 %f10, %r37;
	add.rz.f32 %f11, %f5, %f10;
	cvt.rzi.f32.f32 %f12, %f11;
	setp.leu.f32 %p4, %f12, 0f5EFFFFFF;
	max.f32 %f13, %f12, 0fDF000000;
	cvt.rzi.s64.f32 %rd36, %f13;
	setp.num.f32 %p5, %f12, %f12;
	// First coordinate: (x + 8589934590) >> 2. The selp constants are that
	// same expression evaluated for the too-large and NaN cases.
	add.s64 %rd37, %rd35, 8589934590;
	shr.u64 %rd38, %rd37, 2;
	selp.b64 %rd39, 2305843011361177599, %rd38, %p2;
	selp.b64 %rd40, %rd39, 2147483647, %p3;
	// Second coordinate goes in the upper 32 bits: ((y << 30) + constant)
	// masked to the high word, with a fixed value for too-large / NaN.
	shl.b64 %rd41, %rd36, 30;
	and.b64 %rd42, %rd40, 4294967295;
	add.s64 %rd43, %rd41, 9223372034707292160;
	and.b64 %rd44, %rd43, -4294967296;
	and.pred %p6, %p5, %p4;
	selp.b64 %rd45, %rd44, 9223372032559808512, %p6;
	// Four keys: (lo, hi), (lo, hi+1), (lo+1, hi), (lo+1, hi+1).
	or.b64 %rd46, %rd45, %rd42;
	add.s64 %rd47, %rd45, 4294967296;
	or.b64 %rd48, %rd47, %rd42;
	add.s64 %rd49, %rd40, 1;
	and.b64 %rd50, %rd49, 4294967295;
	or.b64 %rd51, %rd50, %rd45;
	or.b64 %rd52, %rd47, %rd50;
	// Local scratch: 4 keys at +0..+24, cursor at +32, count (4) at +40.
	add.u64 %rd1, %SPL, 0;
	st.local.u64 [%rd1], %rd46;
	st.local.u64 [%rd1+8], %rd48;
	st.local.u64 [%rd1+16], %rd51;
	st.local.u64 [%rd1+24], %rd52;
	st.local.u64 [%rd1+32], %rd82;
	mov.u64 %rd54, 4;
	st.local.u64 [%rd1+40], %rd54;
	// %r3 = capacity - 1 is the maximum number of probes per key; when it is 0
	// no insertion is attempted.
	add.s32 %r3, %r7, -1;
	setp.eq.s32 %p7, %r3, 0;
	@%p7 bra $L__BB7_9;
	cvt.u64.u32 %rd56, %r7;
	add.s64 %rd4, %rd56, -1;
	cvta.to.global.u64 %rd5, %rd16;
	// Outer loop: one iteration per key (%rd82 = cursor, 0..3).
$L__BB7_3:
	shl.b64 %rd59, %rd82, 3;
	add.s64 %rd60, %rd1, %rd59;
	add.s64 %rd82, %rd82, 1;
	st.local.u64 [%rd1+32], %rd82;
	ld.local.u64 %rd8, [%rd60];
	// Same shift-xor / multiply mix as the other lookup kernels in this file.
	shr.u64 %rd61, %rd8, 16;
	xor.b64 %rd62, %rd61, %rd8;
	mul.lo.s64 %rd63, %rd62, 2246822507;
	shr.u64 %rd64, %rd63, 13;
	xor.b64 %rd65, %rd64, %rd63;
	mul.lo.s64 %rd66, %rd65, 3266489909;
	shr.u64 %rd67, %rd66, 16;
	xor.b64 %rd83, %rd67, %rd66;
	mov.u32 %r42, 1;
	// Probe loop: CAS the slot key from -1 to this key.
$L__BB7_4:
	and.b64 %rd11, %rd83, %rd4;
	shl.b64 %rd73, %rd11, 4;
	add.s64 %rd70, %rd18, %rd73;
	mov.u64 %rd71, -1;
	// begin inline asm
	cvta.to.global.u64 %rd68, %rd70;atom.global.cas.b64 %rd69, [%rd68], %rd71, %rd8;
	// end inline asm
	// Old value -1: this thread claimed the slot.
	setp.eq.s64 %p8, %rd69, -1;
	@%p8 bra $L__BB7_7;
	// Old value equals the key: already present, nothing more to do.
	setp.eq.s64 %p9, %rd69, %rd8;
	@%p9 bra $L__BB7_8;
	// Otherwise try the next slot, for at most %r3 probes.
	add.s64 %rd83, %rd11, 1;
	add.s32 %r5, %r42, 1;
	setp.lt.u32 %p10, %r42, %r3;
	mov.u32 %r42, %r5;
	@%p10 bra $L__BB7_4;
	bra.uni $L__BB7_8;
$L__BB7_7:
	cvta.to.global.u64 %rd76, %rd18;
	mov.u32 %r40, 1;
	// Reserve a header index by atomically incrementing the counter at
	// param_2 + 24.
	// begin inline asm
	cvta.to.global.u64 %rd74, %rd17;atom.global.add.u32 %r39, [%rd74], %r40;
	// end inline asm
	// Initialise the 24-byte header: key at +0, zeros at +8, +12 and +16.
	mul.wide.u32 %rd77, %r39, 24;
	add.s64 %rd78, %rd5, %rd77;
	st.global.u64 [%rd78], %rd8;
	mov.u32 %r41, 0;
	st.global.v2.u32 [%rd78+8], {%r41, %r41};
	st.global.u32 [%rd78+16], %r41;
	// Record the header index in the claimed lookup slot (+8).
	add.s64 %rd80, %rd76, %rd73;
	st.global.u32 [%rd80+8], %r39;
$L__BB7_8:
	setp.lt.u64 %p11, %rd82, 4;
	@%p11 bra $L__BB7_3;
	bra.uni $L__BB7_11;
$L__BB7_9:
	st.local.u64 [%rd1+32], %rd54;
$L__BB7_11:
	ret;
}
	// .globl tag_halo_blocks
// ---------------------------------------------------------------------------
// tag_halo_blocks: one thread per 24-byte header in the param_1 array.
//   param_0 (72 bytes): +16 u64 base of 24-byte headers,
//                       +32 u64 base of 16-byte lookup entries,
//                       +40 u32 lookup capacity
//   param_1: u64 address of 24-byte headers whose keys are looked up
//   param_2: u32 number of headers in param_1
//   param_3: u64 address of a u32 counter
// ---------------------------------------------------------------------------
.visible .entry tag_halo_blocks(
	.param .align 8 .b8 tag_halo_blocks_param_0[72],
	.param .u64 tag_halo_blocks_param_1,
	.param .u32 tag_halo_blocks_param_2,
	.param .u64 tag_halo_blocks_param_3
)
{
	.reg .pred %p<7>;
	.reg .f32 %f<2>;
	.reg .b32 %r<31>;
	.reg .b64 %rd<51>;

	ld.param.u64 %rd17, [tag_halo_blocks_param_1];
	ld.param.u32 %r4, [tag_halo_blocks_param_2];
	ld.param.u64 %rd18, [tag_halo_blocks_param_3];
	ld.param.u32 %r3, [tag_halo_blocks_param_0+40];
	ld.param.u64 %rd13, [tag_halo_blocks_param_0+32];
	ld.param.u64 %rd11, [tag_halo_blocks_param_0+16];
	mov.u32 %r5,
%ntid.z;
	// Linear thread id over the 3-D grid/block, same formula as the other
	// per-element kernels in this file.
	mov.u32 %r6, %ntid.y;
	mov.u32 %r7, %ntid.x;
	mov.b64 %rd19, {%r7, %r6};
	mov.u32 %r8, %ctaid.z;
	mov.u32 %r9, %nctaid.y;
	mov.u32 %r10, %ctaid.y;
	mad.lo.s32 %r11, %r8, %r9, %r10;
	mov.u32 %r12, %nctaid.x;
	mov.u32 %r13, %ctaid.x;
	mad.lo.s32 %r14, %r11, %r12, %r13;
	and.b64 %rd20, %rd19, 4294967295;
	cvt.u64.u32 %rd21, %r6;
	bfi.b64 %rd22, %rd21, %rd20, 32, 32;
	cvt.u64.u32 %rd23, %r5;
	mov.b64 {%r15, %r16}, %rd22;
	mov.b64 {%r17, %r18}, %rd23;
	mul.lo.s32 %r19, %r15, %r14;
	mul.lo.s32 %r20, %r19, %r16;
	mov.u32 %r21, %tid.z;
	mov.u32 %r22, %tid.y;
	mad.lo.s32 %r23, %r21, %r6, %r22;
	mov.u32 %r24, %tid.x;
	mad.lo.s32 %r25, %r23, %r7, %r24;
	mad.lo.s32 %r1, %r20, %r17, %r25;
	setp.ge.u32 %p1, %r1, %r4;
	@%p1 bra $L__BB8_8;
	cvta.to.global.u64 %rd24, %rd17;
	cvta.to.global.u64 %rd1, %rd13;
	mul.wide.u32 %rd25, %r1, 24;
	add.s64 %rd26, %rd24, %rd25;
	// %rd2 = key of header %r1 in the param_1 array.
	ld.global.u64 %rd2, [%rd26];
	shr.u64 %rd27, %rd2, 16;
	xor.b64 %rd28, %rd27, %rd2;
	mul.lo.s64 %rd29, %rd28, 2246822507;
	shr.u64 %rd30, %rd29, 13;
	xor.b64 %rd31, %rd30, %rd29;
	mul.lo.s64 %rd32, %rd31, 3266489909;
	shr.u64 %rd33, %rd32, 16;
	xor.b64 %rd34, %rd33, %rd32;
	cvt.u64.u32 %rd35, %r3;
	add.s64 %rd3, %rd35, -1;
	and.b64 %rd49, %rd34, %rd3;
	shl.b64 %rd36, %rd49, 4;
	add.s64 %rd37, %rd1, %rd36;
	ld.global.u64 %rd5, [%rd37];
	setp.eq.s64 %p2, %rd5, %rd2;
	@%p2 bra $L__BB8_6;
	setp.eq.s64 %p3, %rd5, -1;
	@%p3 bra $L__BB8_8;
	// Linear probe until the key or a -1 slot is found.
	// NOTE(review): no iteration bound on this loop.
$L__BB8_4:
	add.s64 %rd38, %rd49, 1;
	and.b64 %rd49, %rd38, %rd3;
	shl.b64 %rd39, %rd49, 4;
	add.s64 %rd40, %rd1, %rd39;
	ld.global.u64 %rd8, [%rd40];
	setp.eq.s64 %p4, %rd8, %rd2;
	@%p4 bra $L__BB8_6;
	setp.eq.s64 %p5, %rd8, -1;
	@%p5 bra $L__BB8_8;
	bra.uni $L__BB8_4;
$L__BB8_6:
	shl.b64 %rd43, %rd49, 4;
	add.s64 %rd44, %rd1, %rd43;
	ld.global.u32 %r28, [%rd44+8];
	mul.wide.u32 %rd45, %r28, 24;
	add.s64 %rd46, %rd11, %rd45;
	add.s64 %rd42, %rd46, 16;
	mov.u32 %r27, 1;
	// Atomically set the u32 at +16 of the matching param_0 header to 1.
	// begin inline asm
	cvta.to.global.u64 %rd41, %rd42;atom.global.exch.b32 %r26, [%rd41], %r27;
	// end inline asm
	// Only the thread that saw the old value 0 bumps the param_3 counter.
	setp.ne.s32 %p6, %r26, 0;
	@%p6 bra $L__BB8_8;
	// begin inline asm
	cvta.to.global.u64 %rd47, %rd18;atom.global.add.u32 %r29, [%rd47], %r27;
	// end inline asm
$L__BB8_8:
	ret;
}
	// .globl tag_halo_neighbors
// ---------------------------------------------------------------------------
// tag_halo_neighbors: one thread per 24-byte header.
//   param_0 (72 bytes): +16 u64 base of 24-byte headers
//                           (u64 key at +0, u32 flags at +16),
//                       +32 u64 base of 16-byte lookup entries,
//                       +40 u32 lookup capacity
//   param_1: u32 number of headers
// For a header whose flag bit 0 is set, three derived keys are looked up and
// bit 1 (value 2) is OR-ed into the flags of every header that is found.
// ---------------------------------------------------------------------------
.visible .entry tag_halo_neighbors(
	.param .align 8 .b8 tag_halo_neighbors_param_0[72],
	.param .u32 tag_halo_neighbors_param_1
)
{
	.reg .pred %p<18>;
	.reg .f32 %f<2>;
	.reg .b32 %r<37>;
	.reg .b64 %rd<104>;

	ld.param.u32 %r4, [tag_halo_neighbors_param_1];
	ld.param.u32 %r3, [tag_halo_neighbors_param_0+40];
	ld.param.u64 %rd29, [tag_halo_neighbors_param_0+32];
	ld.param.u64 %rd27, [tag_halo_neighbors_param_0+16];
	cvta.to.global.u64 %rd1, %rd29;
	mov.u32 %r5, %ntid.z;
	mov.u32 %r6, %ntid.y;
	mov.u32 %r7, %ntid.x;
	mov.b64 %rd33, {%r7, %r6};
	mov.u32 %r8, %ctaid.z;
	mov.u32 %r9, %nctaid.y;
	mov.u32 %r10, %ctaid.y;
	mad.lo.s32 %r11, %r8, %r9, %r10;
	mov.u32 %r12, %nctaid.x;
	mov.u32 %r13, %ctaid.x;
	mad.lo.s32 %r14, %r11, %r12, %r13;
	and.b64 %rd34, %rd33, 4294967295;
	cvt.u64.u32 %rd35, %r6;
	bfi.b64 %rd36, %rd35, %rd34, 32, 32;
	cvt.u64.u32 %rd37, %r5;
	mov.b64 {%r15, %r16}, %rd36;
	mov.b64 {%r17, %r18}, %rd37;
	mul.lo.s32 %r19, %r15, %r14;
	mul.lo.s32 %r20, %r19, %r16;
	mov.u32 %r21, %tid.z;
	mov.u32 %r22, %tid.y;
	mad.lo.s32 %r23, %r21, %r6, %r22;
	mov.u32 %r24, %tid.x;
	mad.lo.s32 %r25, %r23, %r7, %r24;
	mad.lo.s32 %r1, %r20, %r17, %r25;
	setp.ge.u32 %p1, %r1, %r4;
	@%p1 bra $L__BB9_20;
	cvta.to.global.u64 %rd2, %rd27;
	mul.wide.u32 %rd38, %r1, 24;
	add.s64 %rd39, %rd2, %rd38;
	add.s64 %rd3, %rd39, 16;
	// Proceed only when bit 0 of the header's flags word is set.
	ld.global.u32 %r26, [%rd39+16];
	and.b32 %r27, %r26, 1;
	setp.eq.b32 %p2, %r27, 1;
	mov.pred %p3, 0;
	xor.pred %p4, %p2, %p3;
	not.pred %p5, %p4;
	@%p5 bra $L__BB9_20;
	// %rd40 = this header's key, treated as {low 32 bits, high 32 bits}.
	ld.global.u64 %rd40, [%rd3+-16];
	and.b64 %rd41, %rd40, 4294967295;
	and.b64 %rd42, %rd40, -4294967296;
	add.s64 %rd43, %rd42, -4294967296;
	// %rd4 = (low, high-1); %rd5 = (low-1, high); %rd6 = (low-1, high-1).
	or.b64 %rd4, %rd43, %rd41;
	add.s64 %rd44, %rd40, -1;
	and.b64 %rd45, %rd44, 4294967295;
	or.b64 %rd5, %rd45, %rd42;
	or.b64 %rd6, %rd45, %rd43;
	cvt.u64.u32 %rd46, %r3;
	add.s64 %rd7, %rd46, -1;
	// Lookup 1 of 3: key %rd4.
	shr.u64 %rd47, %rd4, 16;
	xor.b64 %rd48, %rd47, %rd4;
	mul.lo.s64 %rd49, %rd48,
2246822507;
	shr.u64 %rd50, %rd49, 13;
	xor.b64 %rd51, %rd50, %rd49;
	mul.lo.s64 %rd52, %rd51, 3266489909;
	shr.u64 %rd53, %rd52, 16;
	xor.b64 %rd54, %rd53, %rd52;
	and.b64 %rd98, %rd54, %rd7;
	shl.b64 %rd55, %rd98, 4;
	add.s64 %rd56, %rd1, %rd55;
	ld.global.u64 %rd9, [%rd56];
	setp.eq.s64 %p6, %rd9, %rd4;
	@%p6 bra $L__BB9_7;
	// A -1 slot means "not present": move on to the next key.
	setp.eq.s64 %p7, %rd9, -1;
	@%p7 bra $L__BB9_8;
	// Linear probe. NOTE(review): no iteration bound on this loop.
$L__BB9_5:
	add.s64 %rd57, %rd98, 1;
	and.b64 %rd98, %rd57, %rd7;
	shl.b64 %rd58, %rd98, 4;
	add.s64 %rd59, %rd1, %rd58;
	ld.global.u64 %rd12, [%rd59];
	setp.eq.s64 %p8, %rd12, %rd4;
	@%p8 bra $L__BB9_7;
	setp.eq.s64 %p9, %rd12, -1;
	@%p9 bra $L__BB9_8;
	bra.uni $L__BB9_5;
$L__BB9_7:
	// Found: OR 2 into the flags word (+16) of the header the entry points to.
	// NOTE(review): plain load / or / store, not an atomic operation.
	shl.b64 %rd60, %rd98, 4;
	add.s64 %rd61, %rd1, %rd60;
	ld.global.u32 %r28, [%rd61+8];
	mul.wide.u32 %rd62, %r28, 24;
	add.s64 %rd63, %rd2, %rd62;
	ld.global.u32 %r29, [%rd63+16];
	or.b32 %r30, %r29, 2;
	st.global.u32 [%rd63+16], %r30;
$L__BB9_8:
	// Lookup 2 of 3: key %rd5.
	shr.u64 %rd64, %rd5, 16;
	xor.b64 %rd65, %rd64, %rd5;
	mul.lo.s64 %rd66, %rd65, 2246822507;
	shr.u64 %rd67, %rd66, 13;
	xor.b64 %rd68, %rd67, %rd66;
	mul.lo.s64 %rd69, %rd68, 3266489909;
	shr.u64 %rd70, %rd69, 16;
	xor.b64 %rd71, %rd70, %rd69;
	and.b64 %rd100, %rd71, %rd7;
	shl.b64 %rd72, %rd100, 4;
	add.s64 %rd73, %rd1, %rd72;
	ld.global.u64 %rd15, [%rd73];
	setp.eq.s64 %p10, %rd15, %rd5;
	@%p10 bra $L__BB9_13;
	setp.eq.s64 %p11, %rd15, -1;
	@%p11 bra $L__BB9_14;
$L__BB9_11:
	add.s64 %rd74, %rd100, 1;
	and.b64 %rd100, %rd74, %rd7;
	shl.b64 %rd75, %rd100, 4;
	add.s64 %rd76, %rd1, %rd75;
	ld.global.u64 %rd18, [%rd76];
	setp.eq.s64 %p12, %rd18, %rd5;
	@%p12 bra $L__BB9_13;
	setp.eq.s64 %p13, %rd18, -1;
	@%p13 bra $L__BB9_14;
	bra.uni $L__BB9_11;
$L__BB9_13:
	shl.b64 %rd77, %rd100, 4;
	add.s64 %rd78, %rd1, %rd77;
	ld.global.u32 %r31, [%rd78+8];
	mul.wide.u32 %rd79, %r31, 24;
	add.s64 %rd80, %rd2, %rd79;
	ld.global.u32 %r32, [%rd80+16];
	or.b32 %r33, %r32, 2;
	st.global.u32 [%rd80+16], %r33;
$L__BB9_14:
	// Lookup 3 of 3: key %rd6.
	shr.u64 %rd81, %rd6, 16;
	xor.b64 %rd82, %rd81, %rd6;
	mul.lo.s64 %rd83, %rd82, 2246822507;
	shr.u64 %rd84, %rd83, 13;
	xor.b64 %rd85, %rd84, %rd83;
	mul.lo.s64 %rd86, %rd85, 3266489909;
	shr.u64 %rd87, %rd86, 16;
	xor.b64 %rd88, %rd87, %rd86;
	and.b64 %rd102, %rd88, %rd7;
	shl.b64 %rd89, %rd102, 4;
	add.s64 %rd90, %rd1, %rd89;
	ld.global.u64 %rd21, [%rd90];
	setp.eq.s64 %p14, %rd21, %rd6;
	@%p14 bra $L__BB9_19;
	setp.eq.s64 %p15, %rd21, -1;
	@%p15 bra $L__BB9_20;
$L__BB9_17:
	add.s64 %rd91, %rd102, 1;
	and.b64 %rd102, %rd91, %rd7;
	shl.b64 %rd92, %rd102, 4;
	add.s64 %rd93, %rd1, %rd92;
	ld.global.u64 %rd24, [%rd93];
	setp.eq.s64 %p16, %rd24, %rd6;
	@%p16 bra $L__BB9_19;
	setp.eq.s64 %p17, %rd24, -1;
	@%p17 bra $L__BB9_20;
	bra.uni $L__BB9_17;
$L__BB9_19:
	shl.b64 %rd94, %rd102, 4;
	add.s64 %rd95, %rd1, %rd94;
	ld.global.u32 %r34, [%rd95+8];
	mul.wide.u32 %rd96, %r34, 24;
	add.s64 %rd97, %rd2, %rd96;
	ld.global.u32 %r35, [%rd97+16];
	or.b32 %r36, %r35, 2;
	st.global.u32 [%rd97+16], %r36;
$L__BB9_20:
	ret;
}
	// .globl copy_halo_to_staging
// ---------------------------------------------------------------------------
// copy_halo_to_staging: one thread per 24-byte header.
//   param_0 (72 bytes): +8  u64 base of 56-byte records (896 bytes per header),
//                       +16 u64 base of 24-byte headers,
//                       +24 u64 address of a u32 header count
//   param_1: u64 base of 904-byte output records
//   param_2: u64 address of a u32 counter, decremented atomically
// ---------------------------------------------------------------------------
.visible .entry copy_halo_to_staging(
	.param .align 8 .b8 copy_halo_to_staging_param_0[72],
	.param .u64 copy_halo_to_staging_param_1,
	.param .u64 copy_halo_to_staging_param_2
)
{
	.reg .pred %p<6>;
	.reg .f32 %f<320>;
	.reg .b32 %r<31>;
	.reg .b64 %rd<62>;

	ld.param.u64 %rd10, [copy_halo_to_staging_param_1];
	ld.param.u64 %rd11, [copy_halo_to_staging_param_2];
	ld.param.u64 %rd5, [copy_halo_to_staging_param_0+24];
	ld.param.u64 %rd4, [copy_halo_to_staging_param_0+16];
	ld.param.u64 %rd3, [copy_halo_to_staging_param_0+8];
	cvta.to.global.u64 %rd12, %rd5;
	mov.u32 %r4, %ntid.z;
	mov.u32 %r5, %ntid.y;
	mov.u32 %r6, %ntid.x;
	mov.b64 %rd13, {%r6, %r5};
	mov.u32 %r7, %ctaid.z;
	mov.u32 %r8, %nctaid.y;
	mov.u32 %r9, %ctaid.y;
	mad.lo.s32 %r10, %r7, %r8, %r9;
	mov.u32 %r11, %nctaid.x;
	mov.u32 %r12, %ctaid.x;
	mad.lo.s32 %r13, %r10, %r11, %r12;
	and.b64 %rd14, %rd13, 4294967295;
	cvt.u64.u32 %rd15, %r5;
	bfi.b64 %rd16, %rd15, %rd14, 32, 32;
	cvt.u64.u32 %rd17, %r4;
	mov.b64 {%r14, %r15}, %rd16;
	mov.b64 {%r16, %r17}, %rd17;
	mul.lo.s32 %r18, %r14, %r13;
	mul.lo.s32 %r19, %r18, %r15;
	mov.u32 %r20, %tid.z;
	mov.u32 %r21, %tid.y;
	mad.lo.s32 %r22, %r20, %r5, %r21;
	mov.u32 %r23, %tid.x;
	mad.lo.s32 %r24, %r22, %r6, %r23;
	// %r1 = linear thread id over the 3-D grid/block.
	mad.lo.s32 %r1, %r19, %r16, %r24;
	// The bound is read from device memory (the u32 at the address held in
	// param_0 + 24) rather than passed by value.
	ld.global.u32 %r25, [%rd12];
	setp.ge.u32 %p1, %r1, %r25;
	@%p1 bra $L__BB10_3;
	cvta.to.global.u64 %rd18, %rd4;
	cvt.u64.u32 %rd1, %r1;
	mul.wide.u32 %rd19, %r1, 24;
	add.s64 %rd20, %rd18, %rd19;
	add.s64 %rd2, %rd20, 16;
	// Proceed only when bit 0 of the header's flags word (+16) is set.
	ld.global.u32 %r26, [%rd20+16];
	and.b32 %r27, %r26, 1;
	setp.eq.b32 %p2, %r27, 1;
	mov.pred %p3, 0;
	xor.pred %p4, %p2, %p3;
	not.pred %p5, %p4;
	@%p5 bra $L__BB10_3;
	mov.u32 %r29, -1;
	// Atomic decrement of the param_2 counter (wrap bound 0xFFFFFFFF);
	// %r28 receives the previous value, and the output slot is %r28 - 1.
	// begin inline asm
	cvta.to.global.u64 %rd21, %rd11;atom.global.dec.u32 %r28, [%rd21], %r29;
	// end inline asm
	add.s32 %r30, %r28, -1;
	cvta.to.global.u64 %rd23, %rd10;
	mul.wide.u32 %rd24, %r30, 904;
	add.s64 %rd25, %rd23, %rd24;
	// Output record layout: header key (u64) at +0, then 896 bytes of record
	// data at +8.
	ld.global.u64 %rd26, [%rd2+-16];
	st.global.u64 [%rd25], %rd26;
	mul.lo.s64 %rd27, %rd1, 896;
	cvta.to.global.u64 %rd28, %rd3;
	add.s64 %rd29, %rd28, %rd27;
	// Straight-line copy of 896 bytes (16 groups of 56 bytes):
	// source offset k -> destination offset k + 8.
	ld.global.v2.f32 {%f2, %f3}, [%rd29];
	ld.global.v2.f32 {%f6, %f7}, [%rd29+8];
	ld.global.v2.f32 {%f10, %f11}, [%rd29+16];
	ld.global.u64 %rd30, [%rd29+24];
	ld.global.u64 %rd31, [%rd29+32];
	ld.global.v2.f32 {%f14, %f15}, [%rd29+40];
	ld.global.v2.f32 {%f18, %f19}, [%rd29+48];
	st.global.v2.f32 [%rd25+8], {%f2, %f3};
	st.global.v2.f32 [%rd25+16], {%f6, %f7};
	st.global.v2.f32 [%rd25+24], {%f10, %f11};
	st.global.u64 [%rd25+32], %rd30;
	st.global.u64 [%rd25+40], %rd31;
	st.global.v2.f32 [%rd25+48], {%f14, %f15};
	st.global.v2.f32 [%rd25+56], {%f18, %f19};
	ld.global.v2.f32 {%f22, %f23}, [%rd29+56];
	ld.global.v2.f32 {%f26, %f27}, [%rd29+64];
	ld.global.v2.f32 {%f30, %f31}, [%rd29+72];
	ld.global.u64 %rd32, [%rd29+80];
	ld.global.u64 %rd33, [%rd29+88];
	ld.global.v2.f32 {%f34, %f35}, [%rd29+96];
	ld.global.v2.f32 {%f38, %f39}, [%rd29+104];
	st.global.v2.f32 [%rd25+64], {%f22, %f23};
	st.global.v2.f32 [%rd25+72], {%f26, %f27};
	st.global.v2.f32 [%rd25+80], {%f30, %f31};
	st.global.u64 [%rd25+88], %rd32;
	st.global.u64 [%rd25+96], %rd33;
	st.global.v2.f32 [%rd25+104], {%f34, %f35};
	st.global.v2.f32 [%rd25+112], {%f38, %f39};
	ld.global.v2.f32 {%f42, %f43}, [%rd29+112];
	ld.global.v2.f32 {%f46, %f47}, [%rd29+120];
	ld.global.v2.f32 {%f50, %f51}, [%rd29+128];
	ld.global.u64 %rd34, [%rd29+136];
	ld.global.u64 %rd35, [%rd29+144];
	ld.global.v2.f32 {%f54, %f55}, [%rd29+152];
	ld.global.v2.f32 {%f58, %f59}, [%rd29+160];
	st.global.v2.f32 [%rd25+120], {%f42, %f43};
	st.global.v2.f32 [%rd25+128], {%f46, %f47};
	st.global.v2.f32 [%rd25+136], {%f50, %f51};
	st.global.u64 [%rd25+144], %rd34;
	st.global.u64 [%rd25+152], %rd35;
	st.global.v2.f32 [%rd25+160], {%f54, %f55};
	st.global.v2.f32 [%rd25+168], {%f58, %f59};
	ld.global.v2.f32 {%f62, %f63}, [%rd29+168];
	ld.global.v2.f32 {%f66, %f67}, [%rd29+176];
	ld.global.v2.f32 {%f70, %f71}, [%rd29+184];
	ld.global.u64 %rd36, [%rd29+192];
	ld.global.u64 %rd37, [%rd29+200];
	ld.global.v2.f32 {%f74, %f75}, [%rd29+208];
	ld.global.v2.f32 {%f78, %f79}, [%rd29+216];
	st.global.v2.f32 [%rd25+176], {%f62, %f63};
	st.global.v2.f32 [%rd25+184], {%f66, %f67};
	st.global.v2.f32 [%rd25+192], {%f70, %f71};
	st.global.u64 [%rd25+200], %rd36;
	st.global.u64 [%rd25+208], %rd37;
	st.global.v2.f32 [%rd25+216], {%f74, %f75};
	st.global.v2.f32 [%rd25+224], {%f78, %f79};
	ld.global.v2.f32 {%f82, %f83}, [%rd29+224];
	ld.global.v2.f32 {%f86, %f87}, [%rd29+232];
	ld.global.v2.f32 {%f90, %f91}, [%rd29+240];
	ld.global.u64 %rd38, [%rd29+248];
	ld.global.u64 %rd39, [%rd29+256];
	ld.global.v2.f32 {%f94, %f95}, [%rd29+264];
	ld.global.v2.f32 {%f98, %f99}, [%rd29+272];
	st.global.v2.f32 [%rd25+232], {%f82, %f83};
	st.global.v2.f32 [%rd25+240], {%f86, %f87};
	st.global.v2.f32 [%rd25+248], {%f90, %f91};
	st.global.u64 [%rd25+256], %rd38;
	st.global.u64 [%rd25+264], %rd39;
	st.global.v2.f32 [%rd25+272], {%f94, %f95};
	st.global.v2.f32 [%rd25+280], {%f98, %f99};
	// This group uses scalar f32 accesses for some fields; the bytes copied
	// follow the same k -> k + 8 mapping.
	ld.global.v2.f32 {%f102, %f103}, [%rd29+280];
	ld.global.f32 %f106, [%rd29+288];
	ld.global.f32 %f107, [%rd29+292];
	ld.global.v2.f32 {%f108, %f109}, [%rd29+296];
	ld.global.u64 %rd40, [%rd29+304];
	ld.global.u64 %rd41, [%rd29+312];
	ld.global.v2.f32 {%f112, %f113}, [%rd29+320];
	ld.global.v2.f32 {%f116, %f117}, [%rd29+328];
	st.global.v2.f32 [%rd25+288], {%f102, %f103};
	st.global.f32 [%rd25+296], %f106;
	st.global.f32 [%rd25+300], %f107;
	st.global.f32 [%rd25+304], %f108;
	st.global.f32 [%rd25+308], %f109;
	st.global.u64 [%rd25+312], %rd40;
	st.global.u64 [%rd25+320], %rd41;
	st.global.v2.f32 [%rd25+328], {%f112, %f113};
	st.global.v2.f32 [%rd25+336], {%f116, %f117};
	ld.global.v2.f32 {%f120, %f121}, [%rd29+336];
	ld.global.v2.f32 {%f124, %f125}, [%rd29+344];
	ld.global.v2.f32 {%f128, %f129}, [%rd29+352];
	ld.global.u64 %rd42, [%rd29+360];
	ld.global.u64 %rd43, [%rd29+368];
	ld.global.v2.f32 {%f132, %f133}, [%rd29+376];
	ld.global.v2.f32 {%f136, %f137}, [%rd29+384];
	st.global.v2.f32 [%rd25+344], {%f120, %f121};
	st.global.v2.f32 [%rd25+352], {%f124, %f125};
	st.global.v2.f32 [%rd25+360], {%f128, %f129};
	st.global.u64 [%rd25+368], %rd42;
	st.global.u64 [%rd25+376], %rd43;
	st.global.v2.f32 [%rd25+384], {%f132, %f133};
	st.global.v2.f32 [%rd25+392], {%f136, %f137};
	ld.global.v2.f32 {%f140, %f141}, [%rd29+392];
	ld.global.v2.f32 {%f144, %f145}, [%rd29+400];
	ld.global.v2.f32 {%f148, %f149}, [%rd29+408];
	ld.global.u64 %rd44, [%rd29+416];
	ld.global.u64 %rd45, [%rd29+424];
	ld.global.v2.f32 {%f152, %f153}, [%rd29+432];
	ld.global.v2.f32 {%f156, %f157}, [%rd29+440];
	st.global.v2.f32 [%rd25+400], {%f140, %f141};
	st.global.v2.f32 [%rd25+408], {%f144, %f145};
	st.global.v2.f32 [%rd25+416], {%f148, %f149};
	st.global.u64 [%rd25+424], %rd44;
	st.global.u64 [%rd25+432], %rd45;
	st.global.v2.f32 [%rd25+440], {%f152, %f153};
	st.global.v2.f32 [%rd25+448], {%f156, %f157};
	ld.global.v2.f32 {%f160, %f161}, [%rd29+448];
	ld.global.v2.f32 {%f164, %f165}, [%rd29+456];
	ld.global.v2.f32 {%f168, %f169}, [%rd29+464];
	ld.global.u64 %rd46, [%rd29+472];
	ld.global.u64 %rd47, [%rd29+480];
	ld.global.v2.f32 {%f172, %f173}, [%rd29+488];
	ld.global.v2.f32 {%f176, %f177}, [%rd29+496];
	st.global.v2.f32 [%rd25+456], {%f160, %f161};
	st.global.v2.f32 [%rd25+464], {%f164, %f165};
	st.global.v2.f32 [%rd25+472], {%f168, %f169};
	st.global.u64 [%rd25+480], %rd46;
	st.global.u64 [%rd25+488], %rd47;
	st.global.v2.f32 [%rd25+496], {%f172, %f173};
	st.global.v2.f32 [%rd25+504], {%f176, %f177};
	ld.global.v2.f32 {%f180, %f181}, [%rd29+504];
	ld.global.v2.f32 {%f184, %f185}, [%rd29+512];
	ld.global.v2.f32 {%f188, %f189}, [%rd29+520];
	ld.global.u64 %rd48, [%rd29+528];
	ld.global.u64 %rd49, [%rd29+536];
	ld.global.v2.f32 {%f192, %f193}, [%rd29+544];
	ld.global.v2.f32 {%f196, %f197}, [%rd29+552];
	st.global.v2.f32 [%rd25+512], {%f180, %f181};
	st.global.v2.f32 [%rd25+520], {%f184, %f185};
	st.global.v2.f32 [%rd25+528], {%f188, %f189};
	st.global.u64 [%rd25+536], %rd48;
	st.global.u64 [%rd25+544], %rd49;
	st.global.v2.f32 [%rd25+552], {%f192, %f193};
	st.global.v2.f32 [%rd25+560], {%f196, %f197};
	ld.global.v2.f32 {%f200, %f201}, [%rd29+560];
	ld.global.v2.f32 {%f204, %f205}, [%rd29+568];
	ld.global.v2.f32 {%f208, %f209}, [%rd29+576];
	ld.global.u64 %rd50, [%rd29+584];
	ld.global.u64 %rd51, [%rd29+592];
	ld.global.v2.f32 {%f212, %f213}, [%rd29+600];
	ld.global.v2.f32 {%f216, %f217}, [%rd29+608];
	st.global.v2.f32 [%rd25+568], {%f200, %f201};
	st.global.v2.f32 [%rd25+576], {%f204, %f205};
	st.global.v2.f32 [%rd25+584], {%f208, %f209};
	st.global.u64 [%rd25+592], %rd50;
	st.global.u64 [%rd25+600], %rd51;
	st.global.v2.f32 [%rd25+608], {%f212, %f213};
	st.global.v2.f32 [%rd25+616], {%f216, %f217};
	ld.global.v2.f32 {%f220, %f221}, [%rd29+616];
	ld.global.v2.f32 {%f224, %f225}, [%rd29+624];
	ld.global.v2.f32 {%f228, %f229}, [%rd29+632];
	ld.global.u64 %rd52, [%rd29+640];
	ld.global.u64 %rd53, [%rd29+648];
	ld.global.v2.f32 {%f232, %f233}, [%rd29+656];
	ld.global.v2.f32 {%f236, %f237}, [%rd29+664];
	st.global.v2.f32 [%rd25+624], {%f220, %f221};
	st.global.v2.f32 [%rd25+632], {%f224, %f225};
	st.global.v2.f32 [%rd25+640], {%f228, %f229};
	st.global.u64 [%rd25+648], %rd52;
	st.global.u64 [%rd25+656], %rd53;
	st.global.v2.f32 [%rd25+664], {%f232, %f233};
	st.global.v2.f32 [%rd25+672], {%f236, %f237};
	ld.global.v2.f32 {%f240, %f241}, [%rd29+672];
	ld.global.v2.f32 {%f244, %f245}, [%rd29+680];
	ld.global.v2.f32 {%f248, %f249}, [%rd29+688];
	ld.global.u64 %rd54, [%rd29+696];
	ld.global.u64 %rd55, [%rd29+704];
	ld.global.v2.f32 {%f252, %f253}, [%rd29+712];
	ld.global.v2.f32 {%f256, %f257}, [%rd29+720];
	st.global.v2.f32 [%rd25+680], {%f240, %f241};
	st.global.v2.f32 [%rd25+688], {%f244, %f245};
	st.global.v2.f32 [%rd25+696], {%f248, %f249};
	st.global.u64 [%rd25+704], %rd54;
	st.global.u64 [%rd25+712], %rd55;
	st.global.v2.f32 [%rd25+720], {%f252, %f253};
	st.global.v2.f32 [%rd25+728], {%f256, %f257};
	ld.global.v2.f32 {%f260, %f261}, [%rd29+728];
	ld.global.v2.f32 {%f264, %f265}, [%rd29+736];
	ld.global.v2.f32 {%f268, %f269}, [%rd29+744];
	ld.global.u64 %rd56, [%rd29+752];
	ld.global.u64 %rd57, [%rd29+760];
	ld.global.v2.f32 {%f272, %f273}, [%rd29+768];
	ld.global.v2.f32 {%f276, %f277}, [%rd29+776];
	st.global.v2.f32 [%rd25+736], {%f260, %f261};
	st.global.v2.f32 [%rd25+744], {%f264, %f265};
	st.global.v2.f32 [%rd25+752], {%f268, %f269};
	st.global.u64 [%rd25+760], %rd56;
	st.global.u64 [%rd25+768], %rd57;
	st.global.v2.f32 [%rd25+776], {%f272, %f273};
	st.global.v2.f32 [%rd25+784], {%f276, %f277};
	ld.global.v2.f32 {%f280, %f281}, [%rd29+784];
	ld.global.v2.f32 {%f284, %f285}, [%rd29+792];
	ld.global.v2.f32 {%f288, %f289}, [%rd29+800];
	ld.global.u64 %rd58, [%rd29+808];
	ld.global.u64 %rd59, [%rd29+816];
	ld.global.v2.f32 {%f292, %f293}, [%rd29+824];
	ld.global.v2.f32 {%f296, %f297}, [%rd29+832];
	st.global.v2.f32 [%rd25+792], {%f280, %f281};
	st.global.v2.f32 [%rd25+800], {%f284, %f285};
	st.global.v2.f32 [%rd25+808], {%f288, %f289};
	st.global.u64 [%rd25+816], %rd58;
	st.global.u64 [%rd25+824], %rd59;
	st.global.v2.f32 [%rd25+832], {%f292, %f293};
	st.global.v2.f32 [%rd25+840], {%f296, %f297};
	ld.global.v2.f32 {%f300, %f301}, [%rd29+840];
	ld.global.v2.f32 {%f304, %f305}, [%rd29+848];
	ld.global.v2.f32 {%f308, %f309}, [%rd29+856];
	ld.global.u64 %rd60, [%rd29+864];
	ld.global.u64 %rd61, [%rd29+872];
	ld.global.v2.f32 {%f312, %f313}, [%rd29+880];
	ld.global.v2.f32 {%f316, %f317}, [%rd29+888];
	st.global.v2.f32 [%rd25+848], {%f300, %f301};
	st.global.v2.f32 [%rd25+856], {%f304, %f305};
	st.global.v2.f32 [%rd25+864], {%f308, %f309};
	st.global.u64 [%rd25+872], %rd60;
	st.global.u64 [%rd25+880], %rd61;
	st.global.v2.f32 [%rd25+888], {%f312, %f313};
	st.global.f32 [%rd25+896], %f316;
	st.global.f32 [%rd25+900], %f317;
$L__BB10_3:
	ret;
}
	// .globl merge_halo_blocks
// ---------------------------------------------------------------------------
// merge_halo_blocks: one block per 904-byte input record (ctaid.x), one
// thread per 56-byte sub-record (tid.x, must be < 16 or the kernel traps).
//   param_0 (72 bytes): +8  u64 base of 56-byte destination records,
//                       +32 u64 base of 16-byte lookup entries,
//                       +40 u32 lookup capacity,
//                       +64 u64 destination record count (bound)
//   param_1: u64 base of 904-byte input records (u64 key at +0, data at +8)
// The record's key is looked up; on a hit, five f32 fields of the input
// sub-record are added to the destination record with red.global.add.f32.
// ---------------------------------------------------------------------------
.visible .entry merge_halo_blocks(
	.param .align 8 .b8 merge_halo_blocks_param_0[72],
	.param .u64 merge_halo_blocks_param_1
)
{
	.reg .pred %p<7>;
	.reg .f32 %f<2>;
	.reg .b32 %r<13>;
	.reg .b64 %rd<66>;

	ld.param.u64 %rd21, [merge_halo_blocks_param_1];
	ld.param.u64 %rd20, [merge_halo_blocks_param_0+64];
	ld.param.u32 %r2, [merge_halo_blocks_param_0+40];
	ld.param.u64 %rd17, [merge_halo_blocks_param_0+32];
	ld.param.u64 %rd14, [merge_halo_blocks_param_0+8];
	cvta.to.global.u64 %rd22, %rd21;
	cvta.to.global.u64 %rd1, %rd17;
	mov.u32 %r3, %ctaid.x;
	mul.wide.u32 %rd23, %r3, 904;
	// %rd24 = this block's input record; %rd2 = its key.
	add.s64 %rd24, %rd22, %rd23;
	ld.global.u64 %rd2, [%rd24];
	shr.u64 %rd25, %rd2, 16;
	xor.b64 %rd26, %rd25, %rd2;
	mul.lo.s64 %rd27, %rd26, 2246822507;
	shr.u64 %rd28, %rd27, 13;
	xor.b64 %rd29, %rd28, %rd27;
	mul.lo.s64 %rd30, %rd29, 3266489909;
	shr.u64 %rd31, %rd30, 16;
	xor.b64 %rd32, %rd31, %rd30;
	cvt.u64.u32 %rd33, %r2;
	add.s64 %rd3, %rd33, -1;
	and.b64 %rd64, %rd32, %rd3;
	shl.b64 %rd34, %rd64, 4;
	add.s64 %rd35, %rd1, %rd34;
	ld.global.u64 %rd5, [%rd35];
	setp.eq.s64 %p1, %rd5, %rd2;
	@%p1 bra $L__BB11_5;
	setp.eq.s64 %p2, %rd5, -1;
	@%p2 bra $L__BB11_10;
	// Linear probe. NOTE(review): no iteration bound on this loop.
$L__BB11_3:
	add.s64 %rd36, %rd64, 1;
	and.b64 %rd64, %rd36, %rd3;
	shl.b64 %rd37, %rd64, 4;
	add.s64 %rd38, %rd1, %rd37;
	ld.global.u64 %rd8, [%rd38];
	setp.eq.s64 %p3, %rd8, %rd2;
	@%p3 bra $L__BB11_5;
	setp.eq.s64 %p4, %rd8, -1;
	@%p4 bra $L__BB11_10;
	bra.uni $L__BB11_3;
$L__BB11_5:
	shl.b64 %rd39, %rd64, 4;
	add.s64 %rd40, %rd1, %rd39;
	ld.global.u32 %r4, [%rd40+8];
	// %rd41 = first destination record index for the matched entry (value * 16).
	mul.wide.u32 %rd41, %r4, 16;
	mov.u32 %r5, %tid.x;
	cvt.u64.u32 %rd10, %r5;
	// %rd11 = destination record index = %rd41 + tid.x (%rd41 is computed just
	// above this point in the kernel). Trap unless it is below the bound %rd20.
	add.s64 %rd11, %rd41, %rd10;
	setp.gt.u64 %p5, %rd20, %rd11;
	@%p5 bra $L__BB11_7;
	bra.uni $L__BB11_6;
$L__BB11_7:
	// %rd13 = generic address of the destination record (56-byte stride).
	mul.lo.s64 %rd42, %rd11, 56;
	add.s64 %rd13, %rd14, %rd42;
	// Trap unless tid.x < 16.
	setp.lt.u32 %p6, %r5, 16;
	@%p6 bra $L__BB11_9;
	bra.uni $L__BB11_8;
$L__BB11_9:
	// %rd57 = input record base + tid.x * 56; its data starts at +8, so input
	// field offset k is read at [%rd57 + 8 + k].
	mul.lo.s64 %rd56, %rd10, 56;
	add.s64 %rd57, %rd24, %rd56;
	// Field at offset 0: destination[+0] += input[+0].
	ld.global.u32 %r7, [%rd57+8];
	// begin inline asm
	cvta.to.global.u64 %rd43, %rd13;red.global.add.f32 [%rd43], %r7;
	// end inline asm
	add.s64 %rd46, %rd13, 4;
	// Fields at offsets 4 and 8 are loaded as two 32-bit halves, packed and
	// unpacked again, then added separately.
	ld.global.u32 %rd60, [%rd57+12];
	ld.global.u32 %rd61, [%rd57+16];
	bfi.b64 %rd62, %rd61, %rd60, 32, 32;
	cvt.u32.u64 %r8, %rd62;
	shr.u64 %rd63, %rd62, 32;
	cvt.u32.u64 %r9, %rd63;
	// begin inline asm
	cvta.to.global.u64 %rd45, %rd46;red.global.add.f32 [%rd45], %r8;
	// end inline asm
	add.s64 %rd48, %rd13, 8;
	// begin inline asm
	cvta.to.global.u64 %rd47, %rd48;red.global.add.f32 [%rd47], %r9;
	// end inline asm
	// Field at offset 16.
	add.s64 %rd50, %rd13, 16;
	ld.global.u32 %r10, [%rd57+24];
	// begin inline asm
	cvta.to.global.u64 %rd49, %rd50;red.global.add.f32 [%rd49], %r10;
	// end inline asm
	// Field at offset 12.
	add.s64 %rd52, %rd13, 12;
	ld.global.u32 %r11, [%rd57+20];
	// begin inline asm
	cvta.to.global.u64 %rd51, %rd52;red.global.add.f32 [%rd51], %r11;
	// end inline asm
$L__BB11_10:
	ret;
$L__BB11_6:
	trap;
$L__BB11_8:
	trap;
}
	// .globl update_block_particle_count
// update_block_particle_count: parameters are a u64, a u32 and a 72-byte
// by-value struct.
// NOTE(review): the body of this entry continues past the end of this chunk.
.visible .entry update_block_particle_count(
	.param .u64 update_block_particle_count_param_0,
	.param .u32 update_block_particle_count_param_1,
	.param .align 8 .b8 update_block_particle_count_param_2[72]
)
{
	.reg .pred %p<13>;
	.reg .f32 %f<15>;
	.reg .b32 %r<36>;
	.reg .b64 %rd<62>;

	ld.param.u64 %rd11, [update_block_particle_count_param_0];
	ld.param.u32 %r4, [update_block_particle_count_param_1];
	ld.param.u32 %r3, [update_block_particle_count_param_2+40];
	ld.param.u64 %rd15, [update_block_particle_count_param_2+32];
	ld.param.u64 %rd13, [update_block_particle_count_param_2+16];
	ld.param.f32 %f2, [update_block_particle_count_param_2];
	mov.u32 %r5, %ntid.z;
	mov.u32 %r6, %ntid.y;
// update_block_particle_count, continued (its .entry header and ld.param loads are at the end of the previous line).
//   gid (%r1) = ((ctaid.z * nctaid.y + ctaid.y) * nctaid.x + ctaid.x) * ntid.x * ntid.y * ntid.z
//               + (tid.z * ntid.y + tid.y) * ntid.x + tid.x. Threads with gid >= param_1 return.
//   The two 32-bit words at param_0 + gid * 8 are reinterpreted as f32 and each is divided by the f32 at param_2[+0].
//   Each quotient q is rounded as trunc(q + copysign(0.5, q)) (add.rz then cvt.rzi) and converted to s64 after a lower
//   clamp at 0fDF000000 (-2^63); call the two results x and y.
//   64-bit key = lo | hi, where
//     lo = ((x + 8589934590) >> 2) & 0xFFFFFFFF, or 2147483647 when the rounded value is NaN or above 0f5EFFFFFF;
//     hi = ((y << 30) + 9223372034707292160) & -4294967296, or 9223372032559808512 under the same condition on y.
//   Lookup in the table at param_2[+32] (mask = u32 at param_2[+40] minus 1): h = key; h ^= h >> 16; h *= 2246822507;
//   h ^= h >> 13; h *= 3266489909; h ^= h >> 16; slot = h & mask; 16-byte entries, linear probing, a stored key of -1
//   ends the kernel. The probe loop ($L__BB12_4) has no iteration limit.
//   On a hit: addr = param_2[+16] + (u32 at entry+8) * 24; unless addr is 0, red.global.add.u32 adds 1 to the u32 at addr+12.
//
// copy_particles_len_to_scan_value(param_0: 72-byte struct, param_1: u64 pointer)
//   n = u32 loaded through the pointer at param_0[+24]; gid is computed as above. For gid < n:
//   u32 param_1[gid] = u32 at param_0[+16] + gid * 24 + 12.
//
// copy_scan_values_to_first_particles(same parameter layout; its last statements are at the start of the next line)
//   Same bounds test. For gid < n: u32 at param_0[+16] + gid * 24 + 8 = u32 param_1[gid].
mov.u32 %r7, %ntid.x; mov.b64 %rd19, {%r7, %r6}; mov.u32 %r8, %ctaid.z; mov.u32 %r9, %nctaid.y; mov.u32 %r10, %ctaid.y; mad.lo.s32 %r11, %r8, %r9, %r10; mov.u32 %r12, %nctaid.x; mov.u32 %r13, %ctaid.x; mad.lo.s32 %r14, %r11, %r12, %r13; and.b64 %rd20, %rd19, 4294967295; cvt.u64.u32 %rd21, %r6; bfi.b64 %rd22, %rd21, %rd20, 32, 32; cvt.u64.u32 %rd23, %r5; mov.b64 {%r15, %r16}, %rd22; mov.b64 {%r17, %r18}, %rd23; mul.lo.s32 %r19, %r15, %r14; mul.lo.s32 %r20, %r19, %r16; mov.u32 %r21, %tid.z; mov.u32 %r22, %tid.y; mad.lo.s32 %r23, %r21, %r6, %r22; mov.u32 %r24, %tid.x; mad.lo.s32 %r25, %r23, %r7, %r24; mad.lo.s32 %r1, %r20, %r17, %r25; setp.ge.u32 %p1, %r1, %r4; @%p1 bra $L__BB12_8; cvta.to.global.u64 %rd24, %rd11; cvta.to.global.u64 %rd1, %rd15; mul.wide.u32 %rd25, %r1, 8; add.s64 %rd26, %rd24, %rd25; ld.global.u32 %rd27, [%rd26]; ld.global.u32 %rd28, [%rd26+4]; bfi.b64 %rd29, %rd28, %rd27, 32, 32; cvt.u32.u64 %r26, %rd29; mov.b32 %f3, %r26; div.rn.f32 %f4, %f3, %f2; shr.u64 %rd30, %rd29, 32; cvt.u32.u64 %r27, %rd30; mov.b32 %f5, %r27; div.rn.f32 %f6, %f5, %f2; mov.b32 %r28, %f4; and.b32 %r29, %r28, -2147483648; or.b32 %r30, %r29, 1056964608; mov.b32 %f7, %r30; add.rz.f32 %f8, %f4, %f7; cvt.rzi.f32.f32 %f9, %f8; setp.leu.f32 %p2, %f9, 0f5EFFFFFF; max.f32 %f10, %f9, 0fDF000000; cvt.rzi.s64.f32 %rd31, %f10; setp.num.f32 %p3, %f9, %f9; mov.b32 %r31, %f6; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r32, 1056964608; mov.b32 %f11, %r33; add.rz.f32 %f12, %f6, %f11; cvt.rzi.f32.f32 %f13, %f12; setp.leu.f32 %p4, %f13, 0f5EFFFFFF; max.f32 %f14, %f13, 0fDF000000; cvt.rzi.s64.f32 %rd32, %f14; setp.num.f32 %p5, %f13, %f13; add.s64 %rd33, %rd31, 8589934590; shr.u64 %rd34, %rd33, 2; shl.b64 %rd35, %rd32, 30; and.b64 %rd36, %rd34, 4294967295; and.pred %p6, %p3, %p2; selp.b64 %rd37, %rd36, 2147483647, %p6; add.s64 %rd38, %rd35, 9223372034707292160; and.b64 %rd39, %rd38, -4294967296; and.pred %p7, %p5, %p4; selp.b64 %rd40, %rd39, 9223372032559808512, %p7; or.b64 %rd2, %rd40, %rd37; 
shr.u64 %rd41, %rd2, 16; xor.b64 %rd42, %rd41, %rd2; mul.lo.s64 %rd43, %rd42, 2246822507; shr.u64 %rd44, %rd43, 13; xor.b64 %rd45, %rd44, %rd43; mul.lo.s64 %rd46, %rd45, 3266489909; shr.u64 %rd47, %rd46, 16; xor.b64 %rd48, %rd47, %rd46; cvt.u64.u32 %rd49, %r3; add.s64 %rd3, %rd49, -1; and.b64 %rd60, %rd48, %rd3; shl.b64 %rd50, %rd60, 4; add.s64 %rd51, %rd1, %rd50; ld.global.u64 %rd5, [%rd51]; setp.eq.s64 %p8, %rd5, %rd2; @%p8 bra $L__BB12_6; setp.eq.s64 %p9, %rd5, -1; @%p9 bra $L__BB12_8; $L__BB12_4: add.s64 %rd52, %rd60, 1; and.b64 %rd60, %rd52, %rd3; shl.b64 %rd53, %rd60, 4; add.s64 %rd54, %rd1, %rd53; ld.global.u64 %rd8, [%rd54]; setp.eq.s64 %p10, %rd8, %rd2; @%p10 bra $L__BB12_6; setp.eq.s64 %p11, %rd8, -1; @%p11 bra $L__BB12_8; bra.uni $L__BB12_4; $L__BB12_6: shl.b64 %rd55, %rd60, 4; add.s64 %rd56, %rd1, %rd55; ld.global.u32 %r34, [%rd56+8]; mul.wide.u32 %rd57, %r34, 24; add.s64 %rd10, %rd13, %rd57; setp.eq.s64 %p12, %rd10, 0; @%p12 bra $L__BB12_8; add.s64 %rd59, %rd10, 12; mov.u32 %r35, 1; // begin inline asm cvta.to.global.u64 %rd58, %rd59;red.global.add.u32 [%rd58], %r35; // end inline asm $L__BB12_8: ret; } // .globl copy_particles_len_to_scan_value .visible .entry copy_particles_len_to_scan_value( .param .align 8 .b8 copy_particles_len_to_scan_value_param_0[72], .param .u64 copy_particles_len_to_scan_value_param_1 ) { .reg .pred %p<2>; .reg .f32 %f<2>; .reg .b32 %r<27>; .reg .b64 %rd<21>; ld.param.u64 %rd8, [copy_particles_len_to_scan_value_param_1]; ld.param.u64 %rd3, [copy_particles_len_to_scan_value_param_0+24]; ld.param.u64 %rd2, [copy_particles_len_to_scan_value_param_0+16]; cvta.to.global.u64 %rd9, %rd3; mov.u32 %r4, %ntid.z; mov.u32 %r5, %ntid.y; mov.u32 %r6, %ntid.x; mov.b64 %rd10, {%r6, %r5}; mov.u32 %r7, %ctaid.z; mov.u32 %r8, %nctaid.y; mov.u32 %r9, %ctaid.y; mad.lo.s32 %r10, %r7, %r8, %r9; mov.u32 %r11, %nctaid.x; mov.u32 %r12, %ctaid.x; mad.lo.s32 %r13, %r10, %r11, %r12; and.b64 %rd11, %rd10, 4294967295; cvt.u64.u32 %rd12, %r5; bfi.b64 %rd13, 
%rd12, %rd11, 32, 32; cvt.u64.u32 %rd14, %r4; mov.b64 {%r14, %r15}, %rd13; mov.b64 {%r16, %r17}, %rd14; mul.lo.s32 %r18, %r14, %r13; mul.lo.s32 %r19, %r18, %r15; mov.u32 %r20, %tid.z; mov.u32 %r21, %tid.y; mad.lo.s32 %r22, %r20, %r5, %r21; mov.u32 %r23, %tid.x; mad.lo.s32 %r24, %r22, %r6, %r23; mad.lo.s32 %r1, %r19, %r16, %r24; ld.global.u32 %r25, [%rd9]; setp.ge.u32 %p1, %r1, %r25; @%p1 bra $L__BB13_2; cvta.to.global.u64 %rd15, %rd8; mul.wide.u32 %rd16, %r1, 4; add.s64 %rd17, %rd15, %rd16; cvta.to.global.u64 %rd18, %rd2; mul.wide.u32 %rd19, %r1, 24; add.s64 %rd20, %rd18, %rd19; ld.global.u32 %r26, [%rd20+12]; st.global.u32 [%rd17], %r26; $L__BB13_2: ret; } // .globl copy_scan_values_to_first_particles .visible .entry copy_scan_values_to_first_particles( .param .align 8 .b8 copy_scan_values_to_first_particles_param_0[72], .param .u64 copy_scan_values_to_first_particles_param_1 ) { .reg .pred %p<2>; .reg .f32 %f<2>; .reg .b32 %r<27>; .reg .b64 %rd<21>; ld.param.u64 %rd8, [copy_scan_values_to_first_particles_param_1]; ld.param.u64 %rd3, [copy_scan_values_to_first_particles_param_0+24]; ld.param.u64 %rd2, [copy_scan_values_to_first_particles_param_0+16]; cvta.to.global.u64 %rd9, %rd3; mov.u32 %r4, %ntid.z; mov.u32 %r5, %ntid.y; mov.u32 %r6, %ntid.x; mov.b64 %rd10, {%r6, %r5}; mov.u32 %r7, %ctaid.z; mov.u32 %r8, %nctaid.y; mov.u32 %r9, %ctaid.y; mad.lo.s32 %r10, %r7, %r8, %r9; mov.u32 %r11, %nctaid.x; mov.u32 %r12, %ctaid.x; mad.lo.s32 %r13, %r10, %r11, %r12; and.b64 %rd11, %rd10, 4294967295; cvt.u64.u32 %rd12, %r5; bfi.b64 %rd13, %rd12, %rd11, 32, 32; cvt.u64.u32 %rd14, %r4; mov.b64 {%r14, %r15}, %rd13; mov.b64 {%r16, %r17}, %rd14; mul.lo.s32 %r18, %r14, %r13; mul.lo.s32 %r19, %r18, %r15; mov.u32 %r20, %tid.z; mov.u32 %r21, %tid.y; mad.lo.s32 %r22, %r20, %r5, %r21; mov.u32 %r23, %tid.x; mad.lo.s32 %r24, %r22, %r6, %r23; mad.lo.s32 %r1, %r19, %r16, %r24; ld.global.u32 %r25, [%rd9]; setp.ge.u32 %p1, %r1, %r25; @%p1 bra $L__BB14_2; cvta.to.global.u64 %rd15, %rd8; 
// The statements up to the first `ret; }` below finish copy_scan_values_to_first_particles (the store to record offset +8).
//
// finalize_particles_sort(param_0: u64 pointer, param_1: u32, param_2: 72-byte struct, param_3: u64 pointer, param_4: u64 pointer)
//   gid (%r1) = linear block index * (ntid.x * ntid.y * ntid.z) + linear thread index inside the block.
//   Threads with gid >= param_1 return.
//   The two 32-bit words at param_0 + gid * 8 are reinterpreted as f32, each divided by the f32 at param_2[+0], rounded as
//   trunc(q + copysign(0.5, q)) and converted to s64 (lower clamp 0fDF000000; NaN or values above 0f5EFFFFFF select the
//   fallback constants 2147483647 / 9223372032559808512). The two results are packed into one 64-bit key:
//     low part  = ((x + 8589934590) >> 2) & 0xFFFFFFFF,  high part = ((y << 30) + 9223372034707292160) & -4294967296.
//   Lookup in the table at param_2[+32] (mask = u32 at param_2[+40] minus 1): h ^= h >> 16; h *= 2246822507; h ^= h >> 13;
//   h *= 3266489909; h ^= h >> 16; 16-byte entries, linear probing, a stored key of -1 ends the kernel.
//   The probe loop ($L__BB15_4) has no iteration limit.
//   On a hit ($L__BB15_6): v = u32 at entry+8; the inline asm that follows (it continues on the next line) does
//   atom.global.add.u32 on the u32 at param_3 + v * 4 with increment 1, and gid is stored as u32 at param_4 + old * 4,
//   where old is the value returned by the atomic.
cvta.to.global.u64 %rd16, %rd2; mul.wide.u32 %rd17, %r1, 24; add.s64 %rd18, %rd16, %rd17; mul.wide.u32 %rd19, %r1, 4; add.s64 %rd20, %rd15, %rd19; ld.global.u32 %r26, [%rd20]; st.global.u32 [%rd18+8], %r26; $L__BB14_2: ret; } // .globl finalize_particles_sort .visible .entry finalize_particles_sort( .param .u64 finalize_particles_sort_param_0, .param .u32 finalize_particles_sort_param_1, .param .align 8 .b8 finalize_particles_sort_param_2[72], .param .u64 finalize_particles_sort_param_3, .param .u64 finalize_particles_sort_param_4 ) { .reg .pred %p<12>; .reg .f32 %f<15>; .reg .b32 %r<59>; .reg .b64 %rd<71>; ld.param.u64 %rd10, [finalize_particles_sort_param_0]; ld.param.u32 %r4, [finalize_particles_sort_param_1]; ld.param.u64 %rd18, [finalize_particles_sort_param_3]; ld.param.u64 %rd19, [finalize_particles_sort_param_4]; ld.param.u32 %r3, [finalize_particles_sort_param_2+40]; ld.param.u64 %rd14, [finalize_particles_sort_param_2+32]; ld.param.f32 %f2, [finalize_particles_sort_param_2]; mov.u32 %r5, %ntid.z; mov.u32 %r6, %ntid.y; mov.u32 %r7, %ntid.x; mov.b64 %rd20, {%r7, %r6}; mov.u32 %r8, %ctaid.z; mov.u32 %r9, %nctaid.y; mov.u32 %r10, %ctaid.y; mad.lo.s32 %r11, %r8, %r9, %r10; mov.u32 %r12, %nctaid.x; mov.u32 %r13, %ctaid.x; mad.lo.s32 %r14, %r11, %r12, %r13; and.b64 %rd21, %rd20, 4294967295; cvt.u64.u32 %rd22, %r6; bfi.b64 %rd23, %rd22, %rd21, 32, 32; cvt.u64.u32 %rd24, %r5; mov.b64 {%r15, %r16}, %rd23; mov.b64 {%r17, %r18}, %rd24; mul.lo.s32 %r19, %r15, %r14; mul.lo.s32 %r20, %r19, %r16; mov.u32 %r21, %tid.z; mov.u32 %r22, %tid.y; mad.lo.s32 %r23, %r21, %r6, %r22; mov.u32 %r24, %tid.x; mad.lo.s32 %r25, %r23, %r7, %r24; mad.lo.s32 %r1, %r20, %r17, %r25; setp.ge.u32 %p1, %r1, %r4; @%p1 bra $L__BB15_7; cvta.to.global.u64 %rd25, %rd10; cvta.to.global.u64 %rd1, %rd14; mul.wide.u32 %rd26, %r1, 8; add.s64 %rd27, %rd25, %rd26; ld.global.u32 %rd28, [%rd27]; ld.global.u32 %rd29, [%rd27+4]; bfi.b64 %rd30, %rd29, %rd28, 32, 32; cvt.u32.u64 %r26, %rd30; mov.b32 %f3, %r26; 
div.rn.f32 %f4, %f3, %f2; shr.u64 %rd31, %rd30, 32; cvt.u32.u64 %r27, %rd31; mov.b32 %f5, %r27; div.rn.f32 %f6, %f5, %f2; mov.b32 %r28, %f4; and.b32 %r29, %r28, -2147483648; or.b32 %r30, %r29, 1056964608; mov.b32 %f7, %r30; add.rz.f32 %f8, %f4, %f7; cvt.rzi.f32.f32 %f9, %f8; setp.leu.f32 %p2, %f9, 0f5EFFFFFF; max.f32 %f10, %f9, 0fDF000000; cvt.rzi.s64.f32 %rd32, %f10; setp.num.f32 %p3, %f9, %f9; mov.b32 %r31, %f6; and.b32 %r32, %r31, -2147483648; or.b32 %r33, %r32, 1056964608; mov.b32 %f11, %r33; add.rz.f32 %f12, %f6, %f11; cvt.rzi.f32.f32 %f13, %f12; setp.leu.f32 %p4, %f13, 0f5EFFFFFF; max.f32 %f14, %f13, 0fDF000000; cvt.rzi.s64.f32 %rd33, %f14; setp.num.f32 %p5, %f13, %f13; add.s64 %rd34, %rd32, 8589934590; shr.u64 %rd35, %rd34, 2; shl.b64 %rd36, %rd33, 30; and.b64 %rd37, %rd35, 4294967295; and.pred %p6, %p3, %p2; selp.b64 %rd38, %rd37, 2147483647, %p6; add.s64 %rd39, %rd36, 9223372034707292160; and.b64 %rd40, %rd39, -4294967296; and.pred %p7, %p5, %p4; selp.b64 %rd41, %rd40, 9223372032559808512, %p7; or.b64 %rd2, %rd41, %rd38; shr.u64 %rd42, %rd2, 16; xor.b64 %rd43, %rd42, %rd2; mul.lo.s64 %rd44, %rd43, 2246822507; shr.u64 %rd45, %rd44, 13; xor.b64 %rd46, %rd45, %rd44; mul.lo.s64 %rd47, %rd46, 3266489909; shr.u64 %rd48, %rd47, 16; xor.b64 %rd49, %rd48, %rd47; cvt.u64.u32 %rd50, %r3; add.s64 %rd3, %rd50, -1; and.b64 %rd69, %rd49, %rd3; shl.b64 %rd51, %rd69, 4; add.s64 %rd52, %rd1, %rd51; ld.global.u64 %rd5, [%rd52]; setp.eq.s64 %p8, %rd5, %rd2; @%p8 bra $L__BB15_6; setp.eq.s64 %p9, %rd5, -1; @%p9 bra $L__BB15_7; $L__BB15_4: add.s64 %rd53, %rd69, 1; and.b64 %rd69, %rd53, %rd3; shl.b64 %rd54, %rd69, 4; add.s64 %rd55, %rd1, %rd54; ld.global.u64 %rd8, [%rd55]; setp.eq.s64 %p10, %rd8, %rd2; @%p10 bra $L__BB15_6; setp.eq.s64 %p11, %rd8, -1; @%p11 bra $L__BB15_7; bra.uni $L__BB15_4; $L__BB15_6: shl.b64 %rd58, %rd69, 4; add.s64 %rd59, %rd1, %rd58; ld.global.u32 %r36, [%rd59+8]; mul.wide.u32 %rd60, %r36, 4; add.s64 %rd57, %rd18, %rd60; mov.u32 %r35, 1; // begin inline asm 
// The statements up to the first `ret; }` below finish finalize_particles_sort (atomic fetch-add, then the u32 store).
//
// write_blocks_multiplicity_to_scan_value(param_0: 72-byte struct, param_1: u64 pointer, param_2: u64 pointer, param_3: u32)
//   gid (%r1) = linear block index * threads per block + linear thread index inside the block.
//   n = u32 loaded through the pointer at param_0[+24]; threads with gid >= n return. After that test, param_3 == 0 traps.
//   rec = param_0[+16] + gid * 24; m = (u32 at rec+12) / param_3, plus 1 when the division leaves a remainder.
//   If ((u32 at rec+16) & 3) == 0: param_1[gid] = m and param_2[gid] = 0; otherwise param_1[gid] = 0 and param_2[gid] = m.
//
// init_gpu_dispatch_blocks_mapping(param_0: 72-byte struct, param_1: u64 pointer, param_2: u64 pointer, param_3: u32)
//   param_3 == 0 traps. rec = param_0[+16] + ctaid.x * 24; m = (u32 at rec+12) / param_3 rounded up as above.
//   Threads with tid.x >= m return. flag = ((u8 at rec+16) & 3) != 0.
//   base = u32 element ctaid.x of (flag ? param_2 : param_1); out = flag ? param_0[+56] : param_0[+48].
//   Loop $L__BB17_3, for i = tid.x; i < m; i += ntid.x: the 8-byte slot at out + (base + i) * 8 receives
//   { ctaid.x, i * param_3 + (u32 at rec+8) } as two u32 stores.
//
// estimate_timestep_length (params 0, 1, 7: f32; params 2, 3, 4, 6, 8: u64 pointers; param 5: declared u64, only its low
// 32 bits are loaded) starts here and continues on the following lines.
//   Threads with gid >= low32(param_5) return. rec = param_2 + gid * 24; threads whose byte at rec+0 is non-zero return.
//   %rd4 = param_6 + (u64 at rec+16) * 96; tag (%r2) = u32 at %rd4+0.
//   tag == 3 ($L__BB18_20): %f190 = min(param_1, 0f7F7FFFFF), then jump to $L__BB18_21.
//   Otherwise ($L__BB18_3): %rd7 = param_4 + gid * 8, %rd8 = param_3 + gid * 32, and the low 16 bits of tag select
//     1 -> $L__BB18_17, 2 -> $L__BB18_7 (with %f187 preset to 1.0), any value other than 3 -> $L__BB18_18,
//     3 -> fall through: %f188 = sum of squares of the two f32 at %rd7, %f189 = 0, jump to $L__BB18_19.
//   $L__BB18_17 begins on the last of these lines and is completed at the start of the next one.
cvta.to.global.u64 %rd56, %rd57;atom.global.add.u32 %r34, [%rd56], %r35; // end inline asm cvta.to.global.u64 %rd61, %rd19; mul.wide.u32 %rd62, %r34, 4; add.s64 %rd63, %rd61, %rd62; st.global.u32 [%rd63], %r1; $L__BB15_7: ret; } // .globl write_blocks_multiplicity_to_scan_value .visible .entry write_blocks_multiplicity_to_scan_value( .param .align 8 .b8 write_blocks_multiplicity_to_scan_value_param_0[72], .param .u64 write_blocks_multiplicity_to_scan_value_param_1, .param .u64 write_blocks_multiplicity_to_scan_value_param_2, .param .u32 write_blocks_multiplicity_to_scan_value_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<2>; .reg .b32 %r<36>; .reg .b64 %rd<24>; ld.param.u64 %rd8, [write_blocks_multiplicity_to_scan_value_param_1]; ld.param.u64 %rd9, [write_blocks_multiplicity_to_scan_value_param_2]; ld.param.u32 %r4, [write_blocks_multiplicity_to_scan_value_param_3]; ld.param.u64 %rd3, [write_blocks_multiplicity_to_scan_value_param_0+24]; ld.param.u64 %rd2, [write_blocks_multiplicity_to_scan_value_param_0+16]; cvta.to.global.u64 %rd10, %rd3; mov.u32 %r5, %ntid.z; mov.u32 %r6, %ntid.y; mov.u32 %r7, %ntid.x; mov.b64 %rd11, {%r7, %r6}; mov.u32 %r8, %ctaid.z; mov.u32 %r9, %nctaid.y; mov.u32 %r10, %ctaid.y; mad.lo.s32 %r11, %r8, %r9, %r10; mov.u32 %r12, %nctaid.x; mov.u32 %r13, %ctaid.x; mad.lo.s32 %r14, %r11, %r12, %r13; and.b64 %rd12, %rd11, 4294967295; cvt.u64.u32 %rd13, %r6; bfi.b64 %rd14, %rd13, %rd12, 32, 32; cvt.u64.u32 %rd15, %r5; mov.b64 {%r15, %r16}, %rd14; mov.b64 {%r17, %r18}, %rd15; mul.lo.s32 %r19, %r15, %r14; mul.lo.s32 %r20, %r19, %r16; mov.u32 %r21, %tid.z; mov.u32 %r22, %tid.y; mad.lo.s32 %r23, %r21, %r6, %r22; mov.u32 %r24, %tid.x; mad.lo.s32 %r25, %r23, %r7, %r24; mad.lo.s32 %r1, %r20, %r17, %r25; ld.global.u32 %r26, [%rd10]; setp.ge.u32 %p1, %r1, %r26; @%p1 bra $L__BB16_3; setp.eq.s32 %p2, %r4, 0; @%p2 bra $L__BB16_4; cvta.to.global.u64 %rd16, %rd2; mul.wide.u32 %rd17, %r1, 24; add.s64 %rd18, %rd16, %rd17; ld.global.u32 %r27, [%rd18+12]; div.u32 %r28, 
%r27, %r4; mul.lo.s32 %r29, %r28, %r4; setp.ne.s32 %p3, %r27, %r29; selp.u32 %r30, 1, 0, %p3; add.s32 %r31, %r28, %r30; ld.global.u32 %r32, [%rd18+16]; and.b32 %r33, %r32, 3; setp.eq.s32 %p4, %r33, 0; selp.b32 %r34, %r31, 0, %p4; selp.b32 %r35, 0, %r31, %p4; cvta.to.global.u64 %rd19, %rd8; mul.wide.u32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; st.global.u32 [%rd21], %r34; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd20; st.global.u32 [%rd23], %r35; $L__BB16_3: ret; $L__BB16_4: trap; } // .globl init_gpu_dispatch_blocks_mapping .visible .entry init_gpu_dispatch_blocks_mapping( .param .align 8 .b8 init_gpu_dispatch_blocks_mapping_param_0[72], .param .u64 init_gpu_dispatch_blocks_mapping_param_1, .param .u64 init_gpu_dispatch_blocks_mapping_param_2, .param .u32 init_gpu_dispatch_blocks_mapping_param_3 ) { .reg .pred %p<6>; .reg .b16 %rs<3>; .reg .f32 %f<2>; .reg .b32 %r<19>; .reg .b64 %rd<23>; ld.param.u64 %rd11, [init_gpu_dispatch_blocks_mapping_param_1]; ld.param.u64 %rd12, [init_gpu_dispatch_blocks_mapping_param_2]; ld.param.u32 %r11, [init_gpu_dispatch_blocks_mapping_param_3]; ld.param.u64 %rd9, [init_gpu_dispatch_blocks_mapping_param_0+56]; ld.param.u64 %rd8, [init_gpu_dispatch_blocks_mapping_param_0+48]; ld.param.u64 %rd5, [init_gpu_dispatch_blocks_mapping_param_0+16]; mov.u32 %r18, %tid.x; mov.u32 %r2, %ctaid.x; setp.eq.s32 %p1, %r11, 0; @%p1 bra $L__BB17_5; cvt.u64.u32 %rd1, %r2; cvta.to.global.u64 %rd13, %rd5; mul.wide.u32 %rd14, %r2, 24; add.s64 %rd15, %rd13, %rd14; add.s64 %rd2, %rd15, 16; ld.global.u32 %r12, [%rd15+12]; div.u32 %r13, %r12, %r11; mul.lo.s32 %r14, %r13, %r11; setp.ne.s32 %p2, %r12, %r14; selp.u32 %r15, 1, 0, %p2; add.s32 %r3, %r13, %r15; setp.ge.u32 %p3, %r18, %r3; @%p3 bra $L__BB17_4; ld.global.u32 %r4, [%rd2+-8]; ld.global.u8 %rs1, [%rd2]; and.b16 %rs2, %rs1, 3; setp.ne.s16 %p4, %rs2, 0; selp.b64 %rd16, %rd12, %rd11, %p4; cvta.to.global.u64 %rd17, %rd16; shl.b64 %rd18, %rd1, 2; add.s64 %rd19, %rd17, %rd18; ld.global.u32 %r5, 
[%rd19]; mov.u32 %r6, %ntid.x; selp.b64 %rd20, %rd9, %rd8, %p4; cvta.to.global.u64 %rd3, %rd20; $L__BB17_3: mad.lo.s32 %r16, %r18, %r11, %r4; add.s32 %r17, %r18, %r5; mul.wide.u32 %rd21, %r17, 8; add.s64 %rd22, %rd3, %rd21; st.global.u32 [%rd22], %r2; st.global.u32 [%rd22+4], %r16; add.s32 %r18, %r18, %r6; setp.lt.u32 %p5, %r18, %r3; @%p5 bra $L__BB17_3; $L__BB17_4: ret; $L__BB17_5: trap; } // .globl estimate_timestep_length .visible .entry estimate_timestep_length( .param .f32 estimate_timestep_length_param_0, .param .f32 estimate_timestep_length_param_1, .param .u64 estimate_timestep_length_param_2, .param .u64 estimate_timestep_length_param_3, .param .u64 estimate_timestep_length_param_4, .param .u64 estimate_timestep_length_param_5, .param .u64 estimate_timestep_length_param_6, .param .f32 estimate_timestep_length_param_7, .param .u64 estimate_timestep_length_param_8 ) { .reg .pred %p<31>; .reg .b16 %rs<3>; .reg .f32 %f<191>; .reg .b32 %r<40>; .reg .b64 %rd<37>; ld.param.f32 %f26, [estimate_timestep_length_param_0]; ld.param.f32 %f27, [estimate_timestep_length_param_1]; ld.param.u64 %rd9, [estimate_timestep_length_param_2]; ld.param.u64 %rd10, [estimate_timestep_length_param_3]; ld.param.u64 %rd11, [estimate_timestep_length_param_4]; ld.param.u64 %rd12, [estimate_timestep_length_param_6]; ld.param.f32 %f28, [estimate_timestep_length_param_7]; ld.param.u64 %rd13, [estimate_timestep_length_param_8]; mov.u32 %r4, %ntid.z; mov.u32 %r5, %ntid.y; mov.u32 %r6, %ntid.x; mov.b64 %rd14, {%r6, %r5}; mov.u32 %r7, %ctaid.z; mov.u32 %r8, %nctaid.y; mov.u32 %r9, %ctaid.y; mad.lo.s32 %r10, %r7, %r8, %r9; mov.u32 %r11, %nctaid.x; mov.u32 %r12, %ctaid.x; mad.lo.s32 %r13, %r10, %r11, %r12; and.b64 %rd15, %rd14, 4294967295; cvt.u64.u32 %rd16, %r5; bfi.b64 %rd17, %rd16, %rd15, 32, 32; cvt.u64.u32 %rd18, %r4; mov.b64 {%r14, %r15}, %rd17; mov.b64 {%r16, %r17}, %rd18; mul.lo.s32 %r18, %r14, %r13; mul.lo.s32 %r19, %r18, %r15; mov.u32 %r20, %tid.z; mov.u32 %r21, %tid.y; mad.lo.s32 %r22, 
%r20, %r5, %r21; mov.u32 %r23, %tid.x; mad.lo.s32 %r24, %r22, %r6, %r23; mad.lo.s32 %r1, %r19, %r16, %r24; ld.param.u32 %r25, [estimate_timestep_length_param_5]; setp.ge.u32 %p1, %r1, %r25; @%p1 bra $L__BB18_22; cvt.u64.u32 %rd1, %r1; cvta.to.global.u64 %rd19, %rd9; mul.wide.u32 %rd20, %r1, 24; add.s64 %rd2, %rd19, %rd20; ld.global.u8 %rs1, [%rd2]; setp.ne.s16 %p2, %rs1, 0; @%p2 bra $L__BB18_22; ld.global.u64 %rd3, [%rd2+16]; cvta.to.global.u64 %rd21, %rd12; mul.lo.s64 %rd22, %rd3, 96; add.s64 %rd4, %rd21, %rd22; ld.global.u32 %r2, [%rd4]; setp.eq.s32 %p3, %r2, 3; @%p3 bra $L__BB18_20; bra.uni $L__BB18_3; $L__BB18_20: mov.f32 %f182, 0f7F7FFFFF; min.f32 %f190, %f27, %f182; bra.uni $L__BB18_21; $L__BB18_3: shl.b64 %rd23, %rd1, 5; shl.b64 %rd24, %rd1, 3; cvt.u16.u32 %rs2, %r2; cvta.to.global.u64 %rd25, %rd11; add.s64 %rd7, %rd25, %rd24; cvta.to.global.u64 %rd26, %rd10; add.s64 %rd8, %rd26, %rd23; setp.eq.s16 %p4, %rs2, 1; @%p4 bra $L__BB18_17; setp.eq.s16 %p5, %rs2, 2; mov.f32 %f187, 0f3F800000; @%p5 bra $L__BB18_7; setp.ne.s16 %p6, %rs2, 3; @%p6 bra $L__BB18_18; ld.global.f32 %f30, [%rd7]; ld.global.f32 %f31, [%rd7+4]; mul.f32 %f32, %f31, %f31; fma.rn.f32 %f33, %f30, %f30, %f32; add.f32 %f188, %f33, 0f00000000; mov.f32 %f189, 0f00000000; bra.uni $L__BB18_19; $L__BB18_17: ld.global.u64 %rd27, [%rd4+24]; shl.b64 %rd28, %rd1, 4; add.s64 %rd29, %rd27, %rd28; ld.f32 %f135, [%rd29+8]; ld.global.f32 %f136, [%rd8+4]; ld.global.f32 %f137, [%rd8]; div.rn.f32 %f138, %f137, %f136; ld.global.f32 %f139, [%rd4+16]; add.f32 %f140, %f139, %f139; div.rn.f32 %f141, %f140, 0f40400000; ld.global.f32 %f142, [%rd4+12]; add.f32 %f143, %f142, %f141; mul.f32 %f144, %f143, %f28; mul.f32 %f145, %f139, %f28; fma.rn.f32 %f146, %f145, 0f3FAAAAAB, %f144; div.rn.f32 %f147, %f146, %f138; sqrt.rn.f32 %f148, %f147; ld.global.f32 %f149, [%rd7]; ld.global.f32 %f150, [%rd7+4]; mul.f32 %f151, %f150, %f150; fma.rn.f32 %f152, %f149, %f149, %f151; add.f32 %f188, %f152, 0f00000000; sqrt.rn.f32 %f153, %f188; 
// estimate_timestep_length, continued (its .entry header, parameter loads and tag dispatch are on the two preceding lines).
// Registers set up there: %f26/%f27/%f28 = params 0/1/7, %rd4 = selected 96-byte record, %rd7 and %rd8 = per-thread
// addresses derived from params 4 and 3, %rd13 = param 8. Every branch below produces %f188 and %f189 for $L__BB18_19.
//   First statements: end of $L__BB18_17; %f189 = (%f135 * f32 at %rd4+8) / max(%f153, %f148).
//   $L__BB18_7: %f3 = [%rd8] / [%rd8+4], %f2 = [%rd8+12], %f5 = (%f3 / %f2) / %f3, %f6 = (f32)(s32 at %rd4+12).
//     The long polynomial sequence, together with $L__BB18_10/_11/_14/_15 at the end of the function, leaves its result
//     in %f187; when %f5 == 1.0 or the s32 is 0, the preset value of %f187 is kept.
//     NOTE(review): this looks like an inlined single-precision pow(%f5, %f6) expansion (constants 0f3FB8AA3B ~ log2(e),
//     0f3F317218 ~ ln 2, plus NaN/zero/infinity/negative-base special cases) -- confirm against the kernel source.
//   $L__BB18_16: t = max((f32 at %rd4+8) * (%f187 - 1.0), -(f32 at %rd4+20)); %f125 = (param_7 / %f2) *
//     sqrt(((%f2 - 1.0) * %f3) / (t * -6.0 + t * -6.0)); %f188 = sum of squares of the two f32 at %rd7;
//     %f134 = param_7 / sqrt(max(%f188, 1.0) / 0f3DCCCCCD); %f189 = min(%f125, %f134).
//   $L__BB18_18 (taken when the low 16 bits of the tag are not 1, 2 or 3): same shape as $L__BB18_17 with different record fields;
//     %f157 is loaded with a generic ld.f32 through the pointer stored at %rd4+24.
//   $L__BB18_19: %f190 = min(min(param_1, %f189), param_7 / sqrt(%f188)).
//   $L__BB18_21: if param_1 > param_0 and %f190 < param_0, param_0 replaces %f190. The value is multiplied by 0f5368D4A5
//     (about 1.0e12), clamped below at 0 and converted to u64 (all ones when the product exceeds 0f5F7FFFFF); the result is
//     combined into the u64 at param_8 with red.global.min.u64.
//   $L__BB18_22: common exit.
//
// _ZN4core6result13unwrap_failed17h02aadeb87602f26eE: .noreturn device function whose whole body is `trap`.
max.f32 %f154, %f153, %f148; ld.global.f32 %f155, [%rd4+8]; mul.f32 %f156, %f135, %f155; div.rn.f32 %f189, %f156, %f154; bra.uni $L__BB18_19; $L__BB18_7: ld.global.f32 %f2, [%rd8+12]; ld.global.f32 %f37, [%rd8]; ld.global.f32 %f38, [%rd8+4]; div.rn.f32 %f3, %f37, %f38; div.rn.f32 %f39, %f3, %f2; ld.global.f32 %f4, [%rd4+8]; div.rn.f32 %f5, %f39, %f3; ld.global.u32 %r3, [%rd4+12]; cvt.rn.f32.s32 %f6, %r3; mul.f32 %f40, %f6, 0f3F000000; cvt.rzi.f32.f32 %f41, %f40; add.f32 %f42, %f41, %f41; sub.f32 %f43, %f6, %f42; abs.f32 %f7, %f43; abs.f32 %f8, %f5; setp.lt.f32 %p7, %f8, 0f00800000; mul.f32 %f44, %f8, 0f4B800000; selp.f32 %f45, %f44, %f8, %p7; selp.f32 %f46, 0fC1C00000, 0f00000000, %p7; mov.b32 %r26, %f45; add.s32 %r27, %r26, -1060439283; and.b32 %r28, %r27, -8388608; sub.s32 %r29, %r26, %r28; mov.b32 %f47, %r29; cvt.rn.f32.s32 %f48, %r28; mov.f32 %f49, 0f34000000; fma.rn.f32 %f50, %f48, %f49, %f46; add.f32 %f51, %f47, 0fBF800000; add.f32 %f35, %f47, 0f3F800000; mov.f32 %f36, 0f3F800000; // begin inline asm rcp.approx.ftz.f32 %f34,%f35; // end inline asm add.f32 %f52, %f51, %f51; mul.f32 %f53, %f34, %f52; mul.f32 %f54, %f53, %f53; sub.f32 %f55, %f51, %f53; add.f32 %f56, %f55, %f55; neg.f32 %f57, %f53; fma.rn.f32 %f58, %f57, %f51, %f56; mul.rn.f32 %f59, %f34, %f58; mov.f32 %f60, 0f3B52E7DB; mov.f32 %f61, 0f3A2C32E4; fma.rn.f32 %f62, %f61, %f54, %f60; mov.f32 %f63, 0f3C93BB73; fma.rn.f32 %f64, %f62, %f54, %f63; mov.f32 %f65, 0f3DF6384F; fma.rn.f32 %f66, %f64, %f54, %f65; mul.rn.f32 %f67, %f66, %f54; mov.f32 %f68, 0f3FB8AA3B; fma.rn.f32 %f69, %f53, %f68, %f50; sub.f32 %f70, %f50, %f69; fma.rn.f32 %f71, %f53, %f68, %f70; fma.rn.f32 %f72, %f59, %f68, %f71; mov.f32 %f73, 0f32A55E34; fma.rn.f32 %f74, %f53, %f73, %f72; mul.f32 %f75, %f67, 0f40400000; fma.rn.f32 %f76, %f75, %f59, %f74; fma.rn.f32 %f77, %f67, %f53, %f76; add.rn.f32 %f78, %f69, %f77; neg.f32 %f79, %f69; add.rn.f32 %f80, %f78, %f79; neg.f32 %f81, %f80; add.rn.f32 %f82, %f77, %f81; mul.rn.f32 %f83, %f78, %f6; 
neg.f32 %f84, %f83; fma.rn.f32 %f85, %f78, %f6, %f84; fma.rn.f32 %f86, %f82, %f6, %f85; cvt.rni.f32.f32 %f87, %f83; sub.f32 %f88, %f83, %f87; add.f32 %f89, %f86, %f88; mov.f32 %f90, 0f3AAF85ED; mov.f32 %f91, 0f391FCB8E; fma.rn.f32 %f92, %f91, %f89, %f90; mov.f32 %f93, 0f3C1D9856; fma.rn.f32 %f94, %f92, %f89, %f93; mov.f32 %f95, 0f3D6357BB; fma.rn.f32 %f96, %f94, %f89, %f95; mov.f32 %f97, 0f3E75FDEC; fma.rn.f32 %f98, %f96, %f89, %f97; mov.f32 %f99, 0f3F317218; fma.rn.f32 %f100, %f98, %f89, %f99; fma.rn.f32 %f101, %f100, %f89, %f36; cvt.rzi.s32.f32 %r30, %f87; setp.gt.f32 %p8, %f87, 0f00000000; selp.b32 %r31, 0, -2097152000, %p8; add.s32 %r32, %r31, 2130706432; mov.b32 %f102, %r32; mul.f32 %f103, %f101, %f102; shl.b32 %r33, %r30, 23; sub.s32 %r34, %r33, %r31; mov.b32 %f104, %r34; mul.f32 %f105, %f103, %f104; abs.f32 %f106, %f83; setp.gt.f32 %p9, %f106, 0f43180000; setp.lt.f32 %p10, %f83, 0f00000000; selp.f32 %f107, 0f00000000, 0f7F800000, %p10; selp.f32 %f9, %f107, %f105, %p9; setp.eq.f32 %p11, %f5, 0f3F800000; setp.eq.s32 %p12, %r3, 0; or.pred %p13, %p11, %p12; @%p13 bra $L__BB18_16; setp.gtu.f32 %p14, %f8, 0f7F800000; @%p14 bra $L__BB18_15; abs.f32 %f10, %f6; setp.gtu.f32 %p15, %f10, 0f7F800000; @%p15 bra $L__BB18_15; bra.uni $L__BB18_10; $L__BB18_15: add.rn.f32 %f187, %f5, %f6; $L__BB18_16: add.f32 %f113, %f187, 0fBF800000; mul.f32 %f114, %f4, %f113; ld.global.f32 %f115, [%rd4+20]; neg.f32 %f116, %f115; max.f32 %f117, %f114, %f116; add.f32 %f118, %f2, 0fBF800000; mul.f32 %f119, %f118, %f3; mul.f32 %f120, %f117, 0fC0C00000; fma.rn.f32 %f121, %f117, 0fC0C00000, %f120; div.rn.f32 %f122, %f119, %f121; sqrt.rn.f32 %f123, %f122; div.rn.f32 %f124, %f28, %f2; mul.f32 %f125, %f124, %f123; ld.global.f32 %f126, [%rd7]; ld.global.f32 %f127, [%rd7+4]; mul.f32 %f128, %f127, %f127; fma.rn.f32 %f129, %f126, %f126, %f128; add.f32 %f188, %f129, 0f00000000; max.f32 %f131, %f188, %f36; div.rn.f32 %f132, %f131, 0f3DCCCCCD; sqrt.rn.f32 %f133, %f132; div.rn.f32 %f134, %f28, %f133; 
min.f32 %f189, %f125, %f134; bra.uni $L__BB18_19; $L__BB18_18: ld.global.u64 %rd30, [%rd4+24]; shl.b64 %rd31, %rd1, 4; add.s64 %rd32, %rd30, %rd31; ld.f32 %f157, [%rd32+8]; ld.global.f32 %f158, [%rd8+4]; ld.global.f32 %f159, [%rd8]; div.rn.f32 %f160, %f159, %f158; ld.global.f32 %f161, [%rd4+20]; add.f32 %f162, %f161, %f161; div.rn.f32 %f163, %f162, 0f40400000; ld.global.f32 %f164, [%rd4+16]; add.f32 %f165, %f164, %f163; mul.f32 %f166, %f157, %f165; mul.f32 %f167, %f157, %f161; fma.rn.f32 %f168, %f167, 0f3FAAAAAB, %f166; div.rn.f32 %f169, %f168, %f160; sqrt.rn.f32 %f170, %f169; ld.global.f32 %f171, [%rd7]; ld.global.f32 %f172, [%rd7+4]; mul.f32 %f173, %f172, %f172; fma.rn.f32 %f174, %f171, %f171, %f173; add.f32 %f188, %f174, 0f00000000; sqrt.rn.f32 %f175, %f188; max.f32 %f176, %f175, %f170; ld.global.f32 %f177, [%rd4+12]; mul.f32 %f178, %f177, %f28; div.rn.f32 %f189, %f178, %f176; $L__BB18_19: sqrt.rn.f32 %f179, %f188; div.rn.f32 %f180, %f28, %f179; min.f32 %f181, %f27, %f189; min.f32 %f190, %f181, %f180; $L__BB18_21: setp.gt.f32 %p27, %f27, %f26; setp.lt.f32 %p28, %f190, %f26; and.pred %p29, %p27, %p28; selp.f32 %f183, %f26, %f190, %p29; mul.f32 %f184, %f183, 0f5368D4A5; setp.gt.f32 %p30, %f184, 0f5F7FFFFF; max.f32 %f185, %f184, 0f00000000; cvt.rzi.u64.f32 %rd36, %f185; selp.b64 %rd35, -1, %rd36, %p30; // begin inline asm cvta.to.global.u64 %rd33, %rd13;red.global.min.u64 [%rd33], %rd35; // end inline asm $L__BB18_22: ret; $L__BB18_10: setp.eq.f32 %p16, %f5, 0f00000000; setp.eq.f32 %p17, %f8, 0f7F800000; or.pred %p18, %p16, %p17; @%p18 bra $L__BB18_14; bra.uni $L__BB18_11; $L__BB18_14: setp.eq.f32 %p25, %f7, 0f3F800000; add.f32 %f112, %f5, %f5; mov.b32 %r35, %f112; xor.b32 %r36, %r35, 2139095040; setp.lt.s32 %p26, %r3, 0; selp.b32 %r37, %r36, %r35, %p26; and.b32 %r38, %r37, 2147483647; selp.b32 %r39, %r37, %r38, %p25; mov.b32 %f187, %r39; bra.uni $L__BB18_16; $L__BB18_11: setp.eq.f32 %p19, %f5, 0fBF800000; setp.eq.f32 %p20, %f10, 0f7F800000; and.pred %p21, %p19, 
%p20; @%p21 bra $L__BB18_16; setp.geu.f32 %p22, %f5, 0f00000000; mov.f32 %f187, %f9; @%p22 bra $L__BB18_16; setp.eq.f32 %p23, %f7, 0f3F800000; neg.f32 %f109, %f9; selp.f32 %f110, %f109, %f9, %p23; cvt.rmi.f32.f32 %f111, %f6; setp.neu.f32 %p24, %f111, %f6; selp.f32 %f187, 0f7FFFFFFF, %f110, %p24; bra.uni $L__BB18_16; } .func _ZN4core6result13unwrap_failed17h02aadeb87602f26eE() .noreturn { trap; }