# Thin wrappers around x86 C intrinsics.
# Every definition forwards its arguments unchanged to the C intrinsic named in the
# string literal via emit{result_type, 'intrinsic', args...}. The '&' conditions after
# the parameter list restrict which argument types a given definition accepts.
# Section markers (#SSE, #SSE2, ...) are line comments and must stay on their own line:
# anything that follows a '#' on the same line is not compiled.

# Helper predicates used in the parameter conditions below.
local {
  # intvec{w,T}: 1 when T is a vector type with an integer element type and width w, else 0.
  def intvec{w,T} = 0
  def intvec{w,T & match{'vector',typekind{T}} & isint{eltype{T}} & match{w,width{T}}} = 1
  # num{T}: matches kind{T} against 'number'; used to constrain immediate operands (imm8 etc.).
  def num{T} = match{'number',kind{T}}
}

#SSE
# Multiply-high on 64-bit ([4]u16) vectors.
def __mulhi{a:T, b:T & match{T,[4]u16}} = emit{T, '_mm_mulhi_pu16', a, b}
def __pmulhuw{a:T, b:T & match{T,[4]u16}} = emit{T, '_m_pmulhuw', a, b}

# [4]f32 arithmetic.
def __add{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_add_ps', a, b}
def __sub{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_sub_ps', a, b}
def __mul{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_mul_ps', a, b}
def __div{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_div_ps', a, b}

# [4]f32 comparisons; the result is declared with the same type as the operands.
def __cmpeq{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpeq_ps', a, b}
def __cmplt{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmplt_ps', a, b}
def __cmple{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmple_ps', a, b}
def __cmpgt{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpgt_ps', a, b}
def __cmpge{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpge_ps', a, b}
def __cmpneq{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpneq_ps', a, b}
def __cmpnlt{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpnlt_ps', a, b}
def __cmpnle{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpnle_ps', a, b}
def __cmpngt{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpngt_ps', a, b}
def __cmpnge{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpnge_ps', a, b}
def __cmpord{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpord_ps', a, b}
def __cmpunord{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_cmpunord_ps', a, b}

# Conversions to and from [4]f32.
# NOTE(review): several wrappers below declare a [2]f32 result for *_pi32 / *_ps2pi
# intrinsics, which return packed integers in an __m64 according to Intel's documentation;
# this looks like a generator artifact of how __m64 is mapped - confirm before relying on
# the element type.
def __cvt{a:T, b:S & match{T,[4]f32} & match{S,i32}} = emit{T, '_mm_cvt_si2ss', a, b}
def __cvtpi32{a:T, b:S & match{T,[4]f32} & match{S,[2]i32}} = emit{T, '_mm_cvtpi32_ps', a, b}
def __cvt{a:T, b:S & match{T,[4]f32} & match{S,[2]i32}} = emit{T, '_mm_cvt_pi2ps', a, b}
def __cvtpi16{a:T & match{T,[4]i16}} = emit{[4]f32, '_mm_cvtpi16_ps', a}
def __cvtpu16{a:T & match{T,[4]u16}} = emit{[4]f32, '_mm_cvtpu16_ps', a}
def __cvtpi8{a:T & match{T,[8]i8}} = emit{[4]f32, '_mm_cvtpi8_ps', a}
def __cvtpu8{a:T & match{T,[8]u8}} = emit{[4]f32, '_mm_cvtpu8_ps', a}
def __cvtpi32x2{a:T, b:T & match{T,[2]i32}} = emit{[4]f32, '_mm_cvtpi32x2_ps', a, b}
def __cvtss_i32{a:T & match{T,[4]f32}} = emit{i32, '_mm_cvtss_si32', a}
def __cvt_i32{a:T & match{T,[4]f32}} = emit{i32, '_mm_cvt_ss2si', a}
def __cvtss_u64{a:T & match{T,[4]f32}} = emit{u64, '_mm_cvtss_si64', a}
def __cvtss_f32{a:T & match{T,[4]f32}} = emit{f32, '_mm_cvtss_f32', a}
def __cvtps_2f32{a:T & match{T,[4]f32}} = emit{[2]f32, '_mm_cvtps_pi32', a}
def __cvt_2f32{a:T & match{T,[4]f32}} = emit{[2]f32, '_mm_cvt_ps2pi', a}
def __cvttss_i32{a:T & match{T,[4]f32}} = emit{i32, '_mm_cvttss_si32', a}
def __cvtt_i32{a:T & match{T,[4]f32}} = emit{i32, '_mm_cvtt_ss2si', a}
def __cvttss_u64{a:T & match{T,[4]f32}} = emit{u64, '_mm_cvttss_si64', a}
def __cvttps_2f32{a:T & match{T,[4]f32}} = emit{[2]f32, '_mm_cvttps_pi32', a}
def __cvtt_2f32{a:T & match{T,[4]f32}} = emit{[2]f32, '_mm_cvtt_ps2pi', a}
def __cvtps_4i16{a:T & match{T,[4]f32}} = emit{[4]i16, '_mm_cvtps_pi16', a}
def __cvtps_8i8{a:T & match{T,[4]f32}} = emit{[8]i8, '_mm_cvtps_pi8', a}

# [4]f32 elementary functions.
def __sqrt{a:T & match{T,[4]f32}} = emit{T, '_mm_sqrt_ps', a}
def __rcp{a:T & match{T,[4]f32}} = emit{T, '_mm_rcp_ps', a}
def __rsqrt{a:T & match{T,[4]f32}} = emit{T, '_mm_rsqrt_ps', a}

# Control/status register access and the helper macros built on top of it.
def __getcsr{} = emit{u32, '_mm_getcsr'}
def __setcsr{a:T & match{T,u32}} = emit{void, '_mm_setcsr', a}
def __GET_EXCEPTION_STATE{} = emit{u32, '_MM_GET_EXCEPTION_STATE'}
def __SET_EXCEPTION_STATE{a:T & match{T,u32}} = emit{void, '_MM_SET_EXCEPTION_STATE', a}
def __GET_EXCEPTION_MASK{} = emit{u32, '_MM_GET_EXCEPTION_MASK'}
def __SET_EXCEPTION_MASK{a:T & match{T,u32}} = emit{void, '_MM_SET_EXCEPTION_MASK', a}
def __GET_ROUNDING_MODE{} = emit{u32, '_MM_GET_ROUNDING_MODE'}
def __SET_ROUNDING_MODE{a:T & match{T,u32}} = emit{void, '_MM_SET_ROUNDING_MODE', a}
def __GET_FLUSH_ZERO_MODE{} = emit{u32, '_MM_GET_FLUSH_ZERO_MODE'}
def __SET_FLUSH_ZERO_MODE{a:T & match{T,u32}} = emit{void, '_MM_SET_FLUSH_ZERO_MODE', a}

# Cache control, fences and aligned allocation.
def __prefetch{p:T, i & match{T,__pnt{u8}} & num{i}} = emit{void, '_mm_prefetch', p, i}
def __sfence{} = emit{void, '_mm_sfence'}
def __malloc{size:T, align:T & match{T,u64}} = emit{__pnt{void}, '_mm_malloc', size, align}
def __free{mem_addr:T & match{T,__pnt{void}}} = emit{void, '_mm_free', mem_addr}

# Loads.
def __undefined_4f32{} = emit{[4]f32, '_mm_undefined_ps'}
def __loadh{a:T, mem_addr:S & match{T,[4]f32} & match{S,__pnt{[2]f32}}} = emit{T, '_mm_loadh_pi', a, mem_addr}
def __loadl{a:T, mem_addr:S & match{T,[4]f32} & match{S,__pnt{[2]f32}}} = emit{T, '_mm_loadl_pi', a, mem_addr}
def __load1{mem_addr:T & match{T,__pnt{f32}}} = emit{[4]f32, '_mm_load1_ps', mem_addr}
def __load_ps1_4f32{mem_addr:T & match{T,__pnt{f32}}} = emit{[4]f32, '_mm_load_ps1', mem_addr}
def __load_ps_4f32{mem_addr:T & match{T,__pnt{f32}}} = emit{[4]f32, '_mm_load_ps', mem_addr}
def __loadu_4f32{mem_addr:T & match{T,__pnt{f32}}} = emit{[4]f32, '_mm_loadu_ps', mem_addr}
def __loadr{mem_addr:T & match{T,__pnt{f32}}} = emit{[4]f32, '_mm_loadr_ps', mem_addr}
def __loadu_2u64{mem_addr:T & match{T,__pnt{void}}} = emit{[2]u64, '_mm_loadu_si64', mem_addr}
def __loadu_8u16{mem_addr:T & match{T,__pnt{void}}} = emit{[8]u16, '_mm_loadu_si16', mem_addr}

# Bitwise logic on [4]f32.
def __and{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_and_ps', a, b}
def __andnot{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_andnot_ps', a, b}
def __or{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_or_ps', a, b}
def __xor{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_xor_ps', a, b}

# Mask extraction, sum of absolute differences, half moves, averages.
def __movemask{a:T & match{T,[8]i8}} = emit{i32, '_mm_movemask_pi8', a}
def __pmovmskb{a:T & match{T,[8]u8}} = emit{i32, '_m_pmovmskb', a}
def __movemask{a:T & match{T,[4]f32}} = emit{i32, '_mm_movemask_ps', a}
def __sad{a:T, b:T & match{T,[8]u8}} = emit{[4]u16, '_mm_sad_pu8', a, b}
def __psadbw{a:T, b:T & match{T,[8]u8}} = emit{[4]u16, '_m_psadbw', a, b}
def __movehl{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_movehl_ps', a, b}
def __movelh{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_movelh_ps', a, b}
def __avg{a:T, b:T & match{T,[8]u8}} = emit{T, '_mm_avg_pu8', a, b}
def __pavgb{a:T, b:T & match{T,[8]u8}} = emit{T, '_m_pavgb', a, b}
def __avg{a:T, b:T & match{T,[4]u16}} = emit{T, '_mm_avg_pu16', a, b}
def __pavgw{a:T, b:T & match{T,[4]u16}} = emit{T, '_m_pavgw', a, b}

# [4]f32 construction.
def __set1_4f32{a:T & match{T,f32}} = emit{[4]f32, '_mm_set1_ps', a}
def __set{a:T & match{T,f32}} = emit{[4]f32, '_mm_set_ps1', a}
def __set{e3:T, e2:T, e1:T, e0:T & match{T,f32}} = emit{[4]f32, '_mm_set_ps', e3, e2, e1, e0}
def __setr{e3:T, e2:T, e1:T, e0:T & match{T,f32}} = emit{[4]f32, '_mm_setr_ps', e3, e2, e1, e0}
def __setzero_4f32{} = emit{[4]f32, '_mm_setzero_ps'}

# Min / max.
def __max{a:T, b:T & match{T,[4]i16}} = emit{T, '_mm_max_pi16', a, b}
def __pmaxsw{a:T, b:T & match{T,[4]i16}} = emit{T, '_m_pmaxsw', a, b}
def __max{a:T, b:T & match{T,[8]u8}} = emit{T, '_mm_max_pu8', a, b}
def __pmaxub{a:T, b:T & match{T,[8]u8}} = emit{T, '_m_pmaxub', a, b}
def __min{a:T, b:T & match{T,[4]i16}} = emit{T, '_mm_min_pi16', a, b}
def __pminsw{a:T, b:T & match{T,[4]i16}} = emit{T, '_m_pminsw', a, b}
def __min{a:T, b:T & match{T,[8]u8}} = emit{T, '_mm_min_pu8', a, b}
def __pminub{a:T, b:T & match{T,[8]u8}} = emit{T, '_m_pminub', a, b}
def __min{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_min_ps', a, b}
def __max{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_max_ps', a, b}

# Stores (all return void).
def __stream{mem_addr:T, a:S & match{T,__pnt{[2]f32}} & match{S,[1]i64}} = emit{void, '_mm_stream_pi', mem_addr, a}
def __maskmove{a:T, mask:T, mem_addr:S & match{T,[8]u8} & match{S,__pnt{u8}}} = emit{void, '_mm_maskmove_si64', a, mask, mem_addr}
def __maskmovq{a:T, mask:T, mem_addr:S & match{T,[8]u8} & match{S,__pnt{u8}}} = emit{void, '_m_maskmovq', a, mask, mem_addr}
def __stream{mem_addr:T, a:S & match{T,__pnt{f32}} & match{S,[4]f32}} = emit{void, '_mm_stream_ps', mem_addr, a}
def __storeh{mem_addr:T, a:S & match{T,__pnt{[2]f32}} & match{S,[4]f32}} = emit{void, '_mm_storeh_pi', mem_addr, a}
def __storel{mem_addr:T, a:S & match{T,__pnt{[2]f32}} & match{S,[4]f32}} = emit{void, '_mm_storel_pi', mem_addr, a}
def __store1{mem_addr:T, a:S & match{T,__pnt{f32}} & match{S,[4]f32}} = emit{void, '_mm_store1_ps', mem_addr, a}
def __store_ps1{mem_addr:T, a:S & match{T,__pnt{f32}} & match{S,[4]f32}} = emit{void, '_mm_store_ps1', mem_addr, a}
def __store_ps{mem_addr:T, a:S & match{T,__pnt{f32}} & match{S,[4]f32}} = emit{void, '_mm_store_ps', mem_addr, a}
def __storeu{mem_addr:T, a:S & match{T,__pnt{f32}} & match{S,[4]f32}} = emit{void, '_mm_storeu_ps', mem_addr, a}
def __storer{mem_addr:T, a:S & match{T,__pnt{f32}} & match{S,[4]f32}} = emit{void, '_mm_storer_ps', mem_addr, a}
def __storeu{mem_addr:T, a:S & match{T,__pnt{void}} & match{S,[8]u16}} = emit{void, '_mm_storeu_si16', mem_addr, a}
def __storeu{mem_addr:T, a:S & match{T,__pnt{void}} & match{S,[2]u64}} = emit{void, '_mm_storeu_si64', mem_addr, a}

# NOTE(review): _MM_TRANSPOSE4_PS is documented by Intel as a macro that rewrites its four
# row arguments in place; whether that works through emit depends on how the arguments
# are passed - confirm.
def __TRANSPOSE4{row0:T, row1:T, row2:T, row3:T & match{T,[4]f32}} = emit{void, '_MM_TRANSPOSE4_PS', row0, row1, row2, row3}

# Element extract/insert and shuffles; imm8 must satisfy num{}.
def __extract{a:T, imm8 & match{T,[4]i16} & num{imm8}} = emit{i32, '_mm_extract_pi16', a, imm8}
def __pextrw{a:T, imm8 & match{T,[4]u16} & num{imm8}} = emit{i32, '_m_pextrw', a, imm8}
def __insert{a:T, i:S, imm8 & match{T,[4]i16} & match{S,i32} & num{imm8}} = emit{T, '_mm_insert_pi16', a, i, imm8}
def __pinsrw{a:T, i:S, imm8 & match{T,[4]u16} & match{S,i32} & num{imm8}} = emit{T, '_m_pinsrw', a, i, imm8}
def __shuffle{a:T, imm8 & match{T,[4]i16} & num{imm8}} = emit{T, '_mm_shuffle_pi16', a, imm8}
def __pshufw{a:T, imm8 & match{T,[4]u16} & num{imm8}} = emit{T, '_m_pshufw', a, imm8}
def __shuffle{a:T, b:T, imm8 & match{T,[4]f32} & num{imm8}} = emit{T, '_mm_shuffle_ps', a, b, imm8}
def __unpackhi{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_unpackhi_ps', a, b}
def __unpacklo{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_unpacklo_ps', a, b}

#SSE2
# Integer add / saturating add / multiply / subtract.
# NOTE(review): the [1]u64 and [2]u32 overloads (_mm_add_si64, _mm_mul_su32, _mm_sub_si64)
# declare a [2]f32 result although Intel documents an integer __m64 result - confirm.
def __add{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_add_epi8', a, b}
def __add{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_add_epi16', a, b}
def __add{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_add_epi32', a, b}
def __add{a:T, b:T & match{T,[1]u64}} = emit{[2]f32, '_mm_add_si64', a, b}
def __add{a:T, b:T & match{T,[2]i64}} = emit{T, '_mm_add_epi64', a, b}
def __adds{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_adds_epi8', a, b}
def __adds{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_adds_epi16', a, b}
def __adds{a:T, b:T & match{T,[16]u8}} = emit{T, '_mm_adds_epu8', a, b}
def __adds{a:T, b:T & match{T,[8]u16}} = emit{T, '_mm_adds_epu16', a, b}
def __madd{a:T, b:T & match{T,[8]i16}} = emit{[4]i32, '_mm_madd_epi16', a, b}
def __mulhi{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_mulhi_epi16', a, b}
def __mulhi{a:T, b:T & match{T,[8]u16}} = emit{T, '_mm_mulhi_epu16', a, b}
def __mullo{a:T, b:T & match{T,[8]u16}} = emit{T, '_mm_mullo_epi16', a, b}
def __mul{a:T, b:T & match{T,[2]u32}} = emit{[2]f32, '_mm_mul_su32', a, b}
def __mul{a:T, b:T & match{T,[4]u32}} = emit{[2]u64, '_mm_mul_epu32', a, b}
def __sub{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_sub_epi8', a, b}
def __sub{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_sub_epi16', a, b}
def __sub{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_sub_epi32', a, b}
def __sub{a:T, b:T & match{T,[1]u64}} = emit{[2]f32, '_mm_sub_si64', a, b}
def __sub{a:T, b:T & match{T,[2]i64}} = emit{T, '_mm_sub_epi64', a, b}
def __subs{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_subs_epi8', a, b}
def __subs{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_subs_epi16', a, b}
def __subs{a:T, b:T & match{T,[16]u8}} = emit{T, '_mm_subs_epu8', a, b}
def __subs{a:T, b:T & match{T,[8]u16}} = emit{T, '_mm_subs_epu16', a, b}

# [2]f64 arithmetic.
def __add{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_add_pd', a, b}
def __div{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_div_pd', a, b}
def __mul{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_mul_pd', a, b}
def __sub{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_sub_pd', a, b}

# Casts between 128-bit vector types.
def __castpd{a:T & match{T,[2]f64}} = emit{[4]f32, '_mm_castpd_ps', a}
def __castps{a:T & match{T,[4]f32}} = emit{[2]f64, '_mm_castps_pd', a}
def __castsi128{a:T & match{T,[2]u64}} = emit{[2]f64, '_mm_castsi128_pd', a}
def __castsi128{a:T & match{T,[4]u32}} = emit{[4]f32, '_mm_castsi128_ps', a}

# Integer comparisons.
def __cmpeq{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_cmpeq_epi8', a, b}
def __cmpeq{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_cmpeq_epi16', a, b}
def __cmpeq{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_cmpeq_epi32', a, b}
def __cmpgt{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_cmpgt_epi8', a, b}
def __cmpgt{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_cmpgt_epi16', a, b}
def __cmpgt{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_cmpgt_epi32', a, b}
def __cmplt{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_cmplt_epi8', a, b}
def __cmplt{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_cmplt_epi16', a, b}
def __cmplt{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_cmplt_epi32', a, b}

# [2]f64 comparisons.
def __cmpeq{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpeq_pd', a, b}
def __cmplt{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmplt_pd', a, b}
def __cmple{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmple_pd', a, b}
def __cmpgt{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpgt_pd', a, b}
def __cmpge{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpge_pd', a, b}
def __cmpord{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpord_pd', a, b}
def __cmpunord{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpunord_pd', a, b}
def __cmpneq{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpneq_pd', a, b}
def __cmpnlt{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpnlt_pd', a, b}
def __cmpnle{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpnle_pd', a, b}
def __cmpngt{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpngt_pd', a, b}
def __cmpnge{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_cmpnge_pd', a, b}

# Conversions.
# NOTE(review): __cvtpd_2f32 / __cvttpd_2f32 wrap *_pi32 intrinsics (integer __m64 result
# per Intel's documentation) but declare [2]f32 - confirm.
def __cvtepi32_2f64{a:T & match{T,[4]i32}} = emit{[2]f64, '_mm_cvtepi32_pd', a}
def __cvtepi32_4f32{a:T & match{T,[4]i32}} = emit{[4]f32, '_mm_cvtepi32_ps', a}
def __cvtpi32{a:T & match{T,[2]i32}} = emit{[2]f64, '_mm_cvtpi32_pd', a}
def __cvtsi32{a:T & match{T,i32}} = emit{[4]u32, '_mm_cvtsi32_si128', a}
def __cvtsi64{a:T & match{T,u64}} = emit{[2]u64, '_mm_cvtsi64_si128', a}
def __cvtsi64x{a:T & match{T,u64}} = emit{[2]u64, '_mm_cvtsi64x_si128', a}
def __cvtsi128{a:T & match{T,[4]u32}} = emit{i32, '_mm_cvtsi128_si32', a}
def __cvtsi128_si64{a:T & match{T,[2]u64}} = emit{u64, '_mm_cvtsi128_si64', a}
def __cvtsi128_si64x{a:T & match{T,[2]u64}} = emit{u64, '_mm_cvtsi128_si64x', a}
def __cvtpd_4f32{a:T & match{T,[2]f64}} = emit{[4]f32, '_mm_cvtpd_ps', a}
def __cvtps_2f64{a:T & match{T,[4]f32}} = emit{[2]f64, '_mm_cvtps_pd', a}
def __cvtpd_4i32{a:T & match{T,[2]f64}} = emit{[4]i32, '_mm_cvtpd_epi32', a}
def __cvtsd_si32{a:T & match{T,[2]f64}} = emit{i32, '_mm_cvtsd_si32', a}
def __cvtsd_si64{a:T & match{T,[2]f64}} = emit{u64, '_mm_cvtsd_si64', a}
def __cvtsd_si64x{a:T & match{T,[2]f64}} = emit{u64, '_mm_cvtsd_si64x', a}
def __cvtsd_f64{a:T & match{T,[2]f64}} = emit{f64, '_mm_cvtsd_f64', a}
def __cvttpd_4i32{a:T & match{T,[2]f64}} = emit{[4]i32, '_mm_cvttpd_epi32', a}
def __cvttsd_si32{a:T & match{T,[2]f64}} = emit{i32, '_mm_cvttsd_si32', a}
def __cvttsd_si64{a:T & match{T,[2]f64}} = emit{u64, '_mm_cvttsd_si64', a}
def __cvttsd_si64x{a:T & match{T,[2]f64}} = emit{u64, '_mm_cvttsd_si64x', a}
def __cvtps_4i32{a:T & match{T,[4]f32}} = emit{[4]i32, '_mm_cvtps_epi32', a}
def __cvttps_4i32{a:T & match{T,[4]f32}} = emit{[4]i32, '_mm_cvttps_epi32', a}
def __cvtpd_2f32{a:T & match{T,[2]f64}} = emit{[2]f32, '_mm_cvtpd_pi32', a}
def __cvttpd_2f32{a:T & match{T,[2]f64}} = emit{[2]f32, '_mm_cvttpd_pi32', a}

# Elementary functions, undefined value, pause, cache control and fences.
def __sqrt{a:T & match{T,[2]f64}} = emit{T, '_mm_sqrt_pd', a}
def __undefined_2f64{} = emit{[2]f64, '_mm_undefined_pd'}
def __pause{} = emit{void, '_mm_pause'}
def __clflush{p:T & match{T,__pnt{void}}} = emit{void, '_mm_clflush', p}
def __lfence{} = emit{void, '_mm_lfence'}
def __mfence{} = emit{void, '_mm_mfence'}

# Loads.
def __loadu_4u32{mem_addr:T & match{T,__pnt{void}}} = emit{[4]u32, '_mm_loadu_si32', mem_addr}
def __loadl{mem_addr:T & match{T,__pnt{[2]i64}}} = emit{[2]i64, '_mm_loadl_epi64', mem_addr}
def __load_pd_2f64{mem_addr:T & match{T,__pnt{f64}}} = emit{[2]f64, '_mm_load_pd', mem_addr}
def __load1{mem_addr:T & match{T,__pnt{f64}}} = emit{[2]f64, '_mm_load1_pd', mem_addr}
def __load_pd1_2f64{mem_addr:T & match{T,__pnt{f64}}} = emit{[2]f64, '_mm_load_pd1', mem_addr}
def __loadr{mem_addr:T & match{T,__pnt{f64}}} = emit{[2]f64, '_mm_loadr_pd', mem_addr}
def __loadu_2f64{mem_addr:T & match{T,__pnt{f64}}} = emit{[2]f64, '_mm_loadu_pd', mem_addr}
def __loadh{a:T, mem_addr:S & match{T,[2]f64} & match{S,__pnt{f64}}} = emit{T, '_mm_loadh_pd', a, mem_addr}
def __loadl{a:T, mem_addr:S & match{T,[2]f64} & match{S,__pnt{f64}}} = emit{T, '_mm_loadl_pd', a, mem_addr}

# Bitwise logic: any 128-bit integer vector (intvec{128,T}), then [2]f64.
def __and{a:T, b:T & intvec{128,T}} = emit{T, '_mm_and_si128', a, b}
def __andnot{a:T, b:T & intvec{128,T}} = emit{T, '_mm_andnot_si128', a, b}
def __or{a:T, b:T & intvec{128,T}} = emit{T, '_mm_or_si128', a, b}
def __xor{a:T, b:T & intvec{128,T}} = emit{T, '_mm_xor_si128', a, b}
def __and{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_and_pd', a, b}
def __andnot{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_andnot_pd', a, b}
def __or{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_or_pd', a, b}
def __xor{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_xor_pd', a, b}

# Moves, packs, masks, sum of absolute differences, averages.
# NOTE(review): __movepi64 declares [2]f32 for an intrinsic documented as returning an
# integer __m64 - confirm.
def __movepi64{a:T & match{T,[2]i64}} = emit{[2]f32, '_mm_movepi64_pi64', a}
def __packs{a:T, b:T & match{T,[8]i16}} = emit{[16]i8, '_mm_packs_epi16', a, b}
def __packs{a:T, b:T & match{T,[4]i32}} = emit{[8]i16, '_mm_packs_epi32', a, b}
def __packus{a:T, b:T & match{T,[8]i16}} = emit{[16]i8, '_mm_packus_epi16', a, b}
def __movemask{a:T & match{T,[16]i8}} = emit{i32, '_mm_movemask_epi8', a}
def __movemask{a:T & match{T,[2]f64}} = emit{i32, '_mm_movemask_pd', a}
def __sad{a:T, b:T & match{T,[16]u8}} = emit{[8]u16, '_mm_sad_epu8', a, b}
def __movpi64{a:T & match{T,[1]i64}} = emit{[2]i64, '_mm_movpi64_epi64', a}
def __move{a:T & match{T,[2]i64}} = emit{T, '_mm_move_epi64', a}
def __avg{a:T, b:T & match{T,[16]u8}} = emit{T, '_mm_avg_epu8', a, b}
def __avg{a:T, b:T & match{T,[8]u16}} = emit{T, '_mm_avg_epu16', a, b}

# Integer vector construction; arguments are passed through in the order written.
def __set{e1:T, e0:T & match{T,[1]i64}} = emit{[2]i64, '_mm_set_epi64', e1, e0}
def __set{e1:T, e0:T & match{T,i64}} = emit{[2]i64, '_mm_set_epi64x', e1, e0}
def __set{e3:T, e2:T, e1:T, e0:T & match{T,i32}} = emit{[4]i32, '_mm_set_epi32', e3, e2, e1, e0}
def __set{e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i16}} = emit{[8]i16, '_mm_set_epi16', e7, e6, e5, e4, e3, e2, e1, e0}
def __set{e15:T, e14:T, e13:T, e12:T, e11:T, e10:T, e9:T, e8:T, e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i8}} = emit{[16]i8, '_mm_set_epi8', e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0}
def __set1{a:T & match{T,[1]i64}} = emit{[2]i64, '_mm_set1_epi64', a}
def __set1_2i64{a:T & match{T,i64}} = emit{[2]i64, '_mm_set1_epi64x', a}
def __set1_4i32{a:T & match{T,i32}} = emit{[4]i32, '_mm_set1_epi32', a}
def __set1_8i16{a:T & match{T,i16}} = emit{[8]i16, '_mm_set1_epi16', a}
def __set1_16i8{a:T & match{T,i8}} = emit{[16]i8, '_mm_set1_epi8', a}
def __setr{e1:T, e0:T & match{T,[1]i64}} = emit{[2]i64, '_mm_setr_epi64', e1, e0}
def __setr{e3:T, e2:T, e1:T, e0:T & match{T,i32}} = emit{[4]i32, '_mm_setr_epi32', e3, e2, e1, e0}
def __setr{e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i16}} = emit{[8]i16, '_mm_setr_epi16', e7, e6, e5, e4, e3, e2, e1, e0}
def __setr{e15:T, e14:T, e13:T, e12:T, e11:T, e10:T, e9:T, e8:T, e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i8}} = emit{[16]i8, '_mm_setr_epi8', e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0}

# [2]f64 construction.
def __set1_2f64{a:T & match{T,f64}} = emit{[2]f64, '_mm_set1_pd', a}
def __set{a:T & match{T,f64}} = emit{[2]f64, '_mm_set_pd1', a}
def __set{e1:T, e0:T & match{T,f64}} = emit{[2]f64, '_mm_set_pd', e1, e0}
def __setr{e1:T, e0:T & match{T,f64}} = emit{[2]f64, '_mm_setr_pd', e1, e0}
def __setzero_2f64{} = emit{[2]f64, '_mm_setzero_pd'}

# Shifts.
# NOTE(review): the intvec{128,T} overloads of __slli/__srli (whole-register *_si128
# forms) also accept the types handled by the per-element overloads that follow them;
# which definition is selected depends on the compiler's overload resolution - confirm.
def __slli{a:T, imm8 & intvec{128,T} & num{imm8}} = emit{T, '_mm_slli_si128', a, imm8}
def __bslli{a:T, imm8 & intvec{128,T} & num{imm8}} = emit{T, '_mm_bslli_si128', a, imm8}
def __bsrli{a:T, imm8 & intvec{128,T} & num{imm8}} = emit{T, '_mm_bsrli_si128', a, imm8}
def __slli{a:T, imm8 & match{T,[8]i16} & num{imm8}} = emit{T, '_mm_slli_epi16', a, imm8}
def __sll{a:T, count:T & match{T,[8]i16}} = emit{T, '_mm_sll_epi16', a, count}
def __slli{a:T, imm8 & match{T,[4]i32} & num{imm8}} = emit{T, '_mm_slli_epi32', a, imm8}
def __sll{a:T, count:T & match{T,[4]i32}} = emit{T, '_mm_sll_epi32', a, count}
def __slli{a:T, imm8 & match{T,[2]i64} & num{imm8}} = emit{T, '_mm_slli_epi64', a, imm8}
def __sll{a:T, count:T & match{T,[2]i64}} = emit{T, '_mm_sll_epi64', a, count}
def __srai{a:T, imm8 & match{T,[8]i16} & num{imm8}} = emit{T, '_mm_srai_epi16', a, imm8}
def __sra{a:T, count:T & match{T,[8]i16}} = emit{T, '_mm_sra_epi16', a, count}
def __srai{a:T, imm8 & match{T,[4]i32} & num{imm8}} = emit{T, '_mm_srai_epi32', a, imm8}
def __sra{a:T, count:T & match{T,[4]i32}} = emit{T, '_mm_sra_epi32', a, count}
def __srli{a:T, imm8 & intvec{128,T} & num{imm8}} = emit{T, '_mm_srli_si128', a, imm8}
def __srli{a:T, imm8 & match{T,[8]u16} & num{imm8}} = emit{T, '_mm_srli_epi16', a, imm8}
def __srl{a:T, count:T & match{T,[8]u16}} = emit{T, '_mm_srl_epi16', a, count}
def __srli{a:T, imm8 & match{T,[4]u32} & num{imm8}} = emit{T, '_mm_srli_epi32', a, imm8}
def __srl{a:T, count:T & match{T,[4]u32}} = emit{T, '_mm_srl_epi32', a, count}
def __srli{a:T, imm8 & match{T,[2]u64} & num{imm8}} = emit{T, '_mm_srli_epi64', a, imm8}
def __srl{a:T, count:T & match{T,[2]u64}} = emit{T, '_mm_srl_epi64', a, count}

# Min / max.
def __max{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_max_epi16', a, b}
def __max{a:T, b:T & match{T,[16]u8}} = emit{T, '_mm_max_epu8', a, b}
def __min{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_min_epi16', a, b}
def __min{a:T, b:T & match{T,[16]u8}} = emit{T, '_mm_min_epu8', a, b}
def __max{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_max_pd', a, b}
def __min{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_min_pd', a, b}

# Stores (all return void).
def __storeu{mem_addr:T, a:S & match{T,__pnt{void}} & match{S,[4]u32}} = emit{void, '_mm_storeu_si32', mem_addr, a}
def __maskmoveu{a:T, mask:T, mem_addr:S & match{T,[16]u8} & match{S,__pnt{u8}}} = emit{void, '_mm_maskmoveu_si128', a, mask, mem_addr}
def __storel{mem_addr:T, a:S & match{T,__pnt{[2]i64}} & match{S,[2]i64}} = emit{void, '_mm_storel_epi64', mem_addr, a}
def __stream{mem_addr:T, a:S & match{T,__pnt{i32}} & match{S,i32}} = emit{void, '_mm_stream_si32', mem_addr, a}
def __stream{mem_addr:T, a:S & match{T,__pnt{u64}} & match{S,u64}} = emit{void, '_mm_stream_si64', mem_addr, a}
def __stream{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[2]f64}} = emit{void, '_mm_stream_pd', mem_addr, a}
def __store1{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[2]f64}} = emit{void, '_mm_store1_pd', mem_addr, a}
def __store_pd1{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[2]f64}} = emit{void, '_mm_store_pd1', mem_addr, a}
def __store_pd{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[2]f64}} = emit{void, '_mm_store_pd', mem_addr, a}
def __storeu{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[2]f64}} = emit{void, '_mm_storeu_pd', mem_addr, a}
def __storer{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[2]f64}} = emit{void, '_mm_storer_pd', mem_addr, a}
def __storeh{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[2]f64}} = emit{void, '_mm_storeh_pd', mem_addr, a}
def __storel{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[2]f64}} = emit{void, '_mm_storel_pd', mem_addr, a}

# Element extract/insert and shuffles; imm8 must satisfy num{}.
def __extract{a:T, imm8 & match{T,[8]i16} & num{imm8}} = emit{i32, '_mm_extract_epi16', a, imm8}
def __insert{a:T, i:S, imm8 & match{T,[8]i16} & match{S,i32} & num{imm8}} = emit{T, '_mm_insert_epi16', a, i, imm8}
def __shuffle{a:T, imm8 & match{T,[4]i32} & num{imm8}} = emit{T, '_mm_shuffle_epi32', a, imm8}
def __shufflehi{a:T, imm8 & match{T,[8]i16} & num{imm8}} = emit{T, '_mm_shufflehi_epi16', a, imm8}
# Remaining SSE2 wrappers, followed by SSE3, SSSE3, SSE4.1, SSE4.2 and the first AVX ones.
# Every definition forwards its arguments unchanged to the C intrinsic named in the
# string literal via emit{result_type, 'intrinsic', args...}. The '&' conditions after
# the parameter list restrict which argument types a given definition accepts.
# Section markers (#SSE3, #AVX, ...) are line comments and must stay on their own line:
# anything that follows a '#' on the same line is not compiled.

# SSE2: shuffles and unpacks.
def __shufflelo{a:T, imm8 & match{T,[8]i16} & num{imm8}} = emit{T, '_mm_shufflelo_epi16', a, imm8}
def __unpackhi{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_unpackhi_epi8', a, b}
def __unpackhi{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_unpackhi_epi16', a, b}
def __unpackhi{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_unpackhi_epi32', a, b}
def __unpackhi{a:T, b:T & match{T,[2]i64}} = emit{T, '_mm_unpackhi_epi64', a, b}
def __unpacklo{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_unpacklo_epi8', a, b}
def __unpacklo{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_unpacklo_epi16', a, b}
def __unpacklo{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_unpacklo_epi32', a, b}
def __unpacklo{a:T, b:T & match{T,[2]i64}} = emit{T, '_mm_unpacklo_epi64', a, b}
def __unpackhi{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_unpackhi_pd', a, b}
def __unpacklo{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_unpacklo_pd', a, b}
def __shuffle{a:T, b:T, imm8 & match{T,[2]f64} & num{imm8}} = emit{T, '_mm_shuffle_pd', a, b, imm8}

#SSE3
# Alternating add/sub, horizontal add/sub, duplicating loads and moves.
def __addsub{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_addsub_ps', a, b}
def __addsub{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_addsub_pd', a, b}
def __hadd{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_hadd_pd', a, b}
def __hadd{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_hadd_ps', a, b}
def __hsub{a:T, b:T & match{T,[2]f64}} = emit{T, '_mm_hsub_pd', a, b}
def __hsub{a:T, b:T & match{T,[4]f32}} = emit{T, '_mm_hsub_ps', a, b}
def __loaddup{mem_addr:T & match{T,__pnt{f64}}} = emit{[2]f64, '_mm_loaddup_pd', mem_addr}
def __movedup{a:T & match{T,[2]f64}} = emit{T, '_mm_movedup_pd', a}
def __movehdup{a:T & match{T,[4]f32}} = emit{T, '_mm_movehdup_ps', a}
def __moveldup{a:T & match{T,[4]f32}} = emit{T, '_mm_moveldup_ps', a}

#SSSE3
# Horizontal integer add/sub (plain and saturating), 128-bit and 64-bit forms.
def __hadd{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_hadd_epi16', a, b}
def __hadds{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_hadds_epi16', a, b}
def __hadd{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_hadd_epi32', a, b}
def __hadd{a:T, b:T & match{T,[4]i16}} = emit{T, '_mm_hadd_pi16', a, b}
def __hadd{a:T, b:T & match{T,[2]i32}} = emit{T, '_mm_hadd_pi32', a, b}
def __hadds{a:T, b:T & match{T,[4]i16}} = emit{T, '_mm_hadds_pi16', a, b}
def __hsub{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_hsub_epi16', a, b}
def __hsubs{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_hsubs_epi16', a, b}
def __hsub{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_hsub_epi32', a, b}
def __hsub{a:T, b:T & match{T,[4]i16}} = emit{T, '_mm_hsub_pi16', a, b}
def __hsub{a:T, b:T & match{T,[2]i32}} = emit{T, '_mm_hsub_pi32', a, b}
def __hsubs{a:T, b:T & match{T,[4]i16}} = emit{T, '_mm_hsubs_pi16', a, b}

# Multiply-add, rounded multiply-high, sign, byte align, absolute value, byte shuffle.
def __maddubs{a:T, b:T & match{T,[16]i8}} = emit{[8]i16, '_mm_maddubs_epi16', a, b}
def __maddubs{a:T, b:T & match{T,[8]i8}} = emit{[4]i16, '_mm_maddubs_pi16', a, b}
def __mulhrs{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_mulhrs_epi16', a, b}
def __mulhrs{a:T, b:T & match{T,[4]i16}} = emit{T, '_mm_mulhrs_pi16', a, b}
def __sign{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_sign_epi8', a, b}
def __sign{a:T, b:T & match{T,[8]i16}} = emit{T, '_mm_sign_epi16', a, b}
def __sign{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_sign_epi32', a, b}
def __sign{a:T, b:T & match{T,[8]i8}} = emit{T, '_mm_sign_pi8', a, b}
def __sign{a:T, b:T & match{T,[4]i16}} = emit{T, '_mm_sign_pi16', a, b}
def __sign{a:T, b:T & match{T,[2]i32}} = emit{T, '_mm_sign_pi32', a, b}
def __alignr{a:T, b:T, imm8 & match{T,[16]i8} & num{imm8}} = emit{T, '_mm_alignr_epi8', a, b, imm8}
def __alignr{a:T, b:T, imm8 & match{T,[8]i8} & num{imm8}} = emit{T, '_mm_alignr_pi8', a, b, imm8}
# __abs takes a signed vector and declares the unsigned vector of the same shape as result.
def __abs{a:T & match{T,[8]i8}} = emit{[8]u8, '_mm_abs_pi8', a}
def __abs{a:T & match{T,[16]i8}} = emit{[16]u8, '_mm_abs_epi8', a}
def __abs{a:T & match{T,[4]i16}} = emit{[4]u16, '_mm_abs_pi16', a}
def __abs{a:T & match{T,[8]i16}} = emit{[8]u16, '_mm_abs_epi16', a}
def __abs{a:T & match{T,[2]i32}} = emit{[2]u32, '_mm_abs_pi32', a}
def __abs{a:T & match{T,[4]i32}} = emit{[4]u32, '_mm_abs_epi32', a}
def __shuffle{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_shuffle_epi8', a, b}
def __shuffle{a:T, b:T & match{T,[8]i8}} = emit{T, '_mm_shuffle_pi8', a, b}

#SSE4.1
# Dot product, widening / low multiply, 64-bit equality.
def __dp{a:T, b:T, imm8 & match{T,[2]f64} & num{imm8}} = emit{T, '_mm_dp_pd', a, b, imm8}
def __dp{a:T, b:T, imm8 & match{T,[4]f32} & num{imm8}} = emit{T, '_mm_dp_ps', a, b, imm8}
def __mul{a:T, b:T & match{T,[4]i32}} = emit{[2]i64, '_mm_mul_epi32', a, b}
def __mullo{a:T, b:T & match{T,[4]u32}} = emit{T, '_mm_mullo_epi32', a, b}
def __cmpeq{a:T, b:T & match{T,[2]i64}} = emit{T, '_mm_cmpeq_epi64', a, b}

# Widening conversions; the wrapper name carries the result type.
# NOTE(review): the cvtepu* (zero-extending) wrappers accept signed-element input types
# and declare signed results, mirroring the cvtepi* ones - presumably a generator
# simplification; confirm that callers are expected to pass signed vectors here.
def __cvtepi8_8i16{a:T & match{T,[16]i8}} = emit{[8]i16, '_mm_cvtepi8_epi16', a}
def __cvtepi8_4i32{a:T & match{T,[16]i8}} = emit{[4]i32, '_mm_cvtepi8_epi32', a}
def __cvtepi8_2i64{a:T & match{T,[16]i8}} = emit{[2]i64, '_mm_cvtepi8_epi64', a}
def __cvtepi16_4i32{a:T & match{T,[8]i16}} = emit{[4]i32, '_mm_cvtepi16_epi32', a}
def __cvtepi16_2i64{a:T & match{T,[8]i16}} = emit{[2]i64, '_mm_cvtepi16_epi64', a}
def __cvtepi32_2i64{a:T & match{T,[4]i32}} = emit{[2]i64, '_mm_cvtepi32_epi64', a}
def __cvtepu8_8i16{a:T & match{T,[16]i8}} = emit{[8]i16, '_mm_cvtepu8_epi16', a}
def __cvtepu8_4i32{a:T & match{T,[16]i8}} = emit{[4]i32, '_mm_cvtepu8_epi32', a}
def __cvtepu8_2i64{a:T & match{T,[16]i8}} = emit{[2]i64, '_mm_cvtepu8_epi64', a}
def __cvtepu16_4i32{a:T & match{T,[8]i16}} = emit{[4]i32, '_mm_cvtepu16_epi32', a}
def __cvtepu16_2i64{a:T & match{T,[8]i16}} = emit{[2]i64, '_mm_cvtepu16_epi64', a}
def __cvtepu32_2i64{a:T & match{T,[4]i32}} = emit{[2]i64, '_mm_cvtepu32_epi64', a}

# Bit tests on any 128-bit integer vector; all return i32.
# Note that '_mm_test_all_ones' is exposed under the short name __test.
def __testz{a:T, b:T & intvec{128,T}} = emit{i32, '_mm_testz_si128', a, b}
def __testc{a:T, b:T & intvec{128,T}} = emit{i32, '_mm_testc_si128', a, b}
def __testnzc{a:T, b:T & intvec{128,T}} = emit{i32, '_mm_testnzc_si128', a, b}
def __test_all_zeros{a:T, mask:T & intvec{128,T}} = emit{i32, '_mm_test_all_zeros', a, mask}
def __test_mix_ones_zeros{a:T, mask:T & intvec{128,T}} = emit{i32, '_mm_test_mix_ones_zeros', a, mask}
def __test{a:T & intvec{128,T}} = emit{i32, '_mm_test_all_ones', a}

# Minimum position, multiple SAD, unsigned-saturating pack.
def __minpos{a:T & match{T,[8]u16}} = emit{T, '_mm_minpos_epu16', a}
def __mpsadbw{a:T, b:T, imm8 & match{T,[16]u8} & num{imm8}} = emit{T, '_mm_mpsadbw_epu8', a, b, imm8}
def __packus{a:T, b:T & match{T,[4]i32}} = emit{[8]i16, '_mm_packus_epi32', a, b}

# Min / max for the element types not covered by SSE2.
def __max{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_max_epi8', a, b}
def __max{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_max_epi32', a, b}
def __max{a:T, b:T & match{T,[4]u32}} = emit{T, '_mm_max_epu32', a, b}
def __max{a:T, b:T & match{T,[8]u16}} = emit{T, '_mm_max_epu16', a, b}
def __min{a:T, b:T & match{T,[16]i8}} = emit{T, '_mm_min_epi8', a, b}
def __min{a:T, b:T & match{T,[4]i32}} = emit{T, '_mm_min_epi32', a, b}
def __min{a:T, b:T & match{T,[4]u32}} = emit{T, '_mm_min_epu32', a, b}
def __min{a:T, b:T & match{T,[8]u16}} = emit{T, '_mm_min_epu16', a, b}

# Rounding; the rounding argument must satisfy num{}.
def __round{a:T, rounding & match{T,[2]f64} & num{rounding}} = emit{T, '_mm_round_pd', a, rounding}
def __floor{a:T & match{T,[2]f64}} = emit{T, '_mm_floor_pd', a}
def __ceil{a:T & match{T,[2]f64}} = emit{T, '_mm_ceil_pd', a}
def __round{a:T, rounding & match{T,[4]f32} & num{rounding}} = emit{T, '_mm_round_ps', a, rounding}
def __floor{a:T & match{T,[4]f32}} = emit{T, '_mm_floor_ps', a}
def __ceil{a:T & match{T,[4]f32}} = emit{T, '_mm_ceil_ps', a}

# Blends: immediate-controlled (__blend) and vector-mask-controlled (__blendv).
def __blend{a:T, b:T, imm8 & match{T,[2]f64} & num{imm8}} = emit{T, '_mm_blend_pd', a, b, imm8}
def __blend{a:T, b:T, imm8 & match{T,[4]f32} & num{imm8}} = emit{T, '_mm_blend_ps', a, b, imm8}
def __blendv{a:T, b:T, mask:T & match{T,[2]f64}} = emit{T, '_mm_blendv_pd', a, b, mask}
def __blendv{a:T, b:T, mask:T & match{T,[4]f32}} = emit{T, '_mm_blendv_ps', a, b, mask}
def __blendv{a:T, b:T, mask:T & match{T,[16]i8}} = emit{T, '_mm_blendv_epi8', a, b, mask}
def __blend{a:T, b:T, imm8 & match{T,[8]i16} & num{imm8}} = emit{T, '_mm_blend_epi16', a, b, imm8}

# Element extract / insert; the [4]f32 extract declares an i32 result.
def __extract{a:T, imm8 & match{T,[4]f32} & num{imm8}} = emit{i32, '_mm_extract_ps', a, imm8}
def __extract{a:T, imm8 & match{T,[16]i8} & num{imm8}} = emit{i32, '_mm_extract_epi8', a, imm8}
def __extract{a:T, imm8 & match{T,[4]i32} & num{imm8}} = emit{i32, '_mm_extract_epi32', a, imm8}
def __extract{a:T, imm8 & match{T,[2]i64} & num{imm8}} = emit{i64, '_mm_extract_epi64', a, imm8}
def __insert{a:T, b:T, imm8 & match{T,[4]f32} & num{imm8}} = emit{T, '_mm_insert_ps', a, b, imm8}
def __insert{a:T, i:S, imm8 & match{T,[16]i8} & match{S,i32} & num{imm8}} = emit{T, '_mm_insert_epi8', a, i, imm8}
def __insert{a:T, i:S, imm8 & match{T,[4]i32} & match{S,i32} & num{imm8}} = emit{T, '_mm_insert_epi32', a, i, imm8}
def __insert{a:T, i:S, imm8 & match{T,[2]i64} & match{S,i64} & num{imm8}} = emit{T, '_mm_insert_epi64', a, i, imm8}

#SSE4.2
# 64-bit greater-than and CRC32 accumulation.
def __cmpgt{a:T, b:T & match{T,[2]i64}} = emit{T, '_mm_cmpgt_epi64', a, b}
def __crc32{crc:T, v:S & match{T,u32} & match{S,u8}} = emit{T, '_mm_crc32_u8', crc, v}
def __crc32{crc:T, v:S & match{T,u32} & match{S,u16}} = emit{T, '_mm_crc32_u16', crc, v}
def __crc32{crc:T, v:T & match{T,u32}} = emit{T, '_mm_crc32_u32', crc, v}
def __crc32{crc:T, v:T & match{T,u64}} = emit{T, '_mm_crc32_u64', crc, v}

# String comparison, implicit-length forms (operands are any 128-bit integer vector).
def __cmpistrm{a:T, b:T, imm8 & intvec{128,T} & num{imm8}} = emit{T, '_mm_cmpistrm', a, b, imm8}
def __cmpistri{a:T, b:T, imm8 & intvec{128,T} & num{imm8}} = emit{i32, '_mm_cmpistri', a, b, imm8}
def __cmpistrz{a:T, b:T, imm8 & intvec{128,T} & num{imm8}} = emit{i32, '_mm_cmpistrz', a, b, imm8}
def __cmpistrc{a:T, b:T, imm8 & intvec{128,T} & num{imm8}} = emit{i32, '_mm_cmpistrc', a, b, imm8}
def __cmpistrs{a:T, b:T, imm8 & intvec{128,T} & num{imm8}} = emit{i32, '_mm_cmpistrs', a, b, imm8}
def __cmpistro{a:T, b:T, imm8 & intvec{128,T} & num{imm8}} = emit{i32, '_mm_cmpistro', a, b, imm8}
def __cmpistra{a:T, b:T, imm8 & intvec{128,T} & num{imm8}} = emit{i32, '_mm_cmpistra', a, b, imm8}

# String comparison, explicit-length forms; la/lb are i32 and the scalar results reuse S (= i32).
def __cmpestrm{a:T, la:S, b:T, lb:S, imm8 & intvec{128,T} & match{S,i32} & num{imm8}} = emit{T, '_mm_cmpestrm', a, la, b, lb, imm8}
def __cmpestri{a:T, la:S, b:T, lb:S, imm8 & intvec{128,T} & match{S,i32} & num{imm8}} = emit{S, '_mm_cmpestri', a, la, b, lb, imm8}
def __cmpestrz{a:T, la:S, b:T, lb:S, imm8 & intvec{128,T} & match{S,i32} & num{imm8}} = emit{S, '_mm_cmpestrz', a, la, b, lb, imm8}
def __cmpestrc{a:T, la:S, b:T, lb:S, imm8 & intvec{128,T} & match{S,i32} & num{imm8}} = emit{S, '_mm_cmpestrc', a, la, b, lb, imm8}
def __cmpestrs{a:T, la:S, b:T, lb:S, imm8 & intvec{128,T} & match{S,i32} & num{imm8}} = emit{S, '_mm_cmpestrs', a, la, b, lb, imm8}
def __cmpestro{a:T, la:S, b:T, lb:S, imm8 & intvec{128,T} & match{S,i32} & num{imm8}} = emit{S, '_mm_cmpestro', a, la, b, lb, imm8}
def __cmpestra{a:T, la:S, b:T, lb:S, imm8 & intvec{128,T} & match{S,i32} & num{imm8}} = emit{S, '_mm_cmpestra', a, la, b, lb, imm8}

#AVX
# 256-bit floating-point arithmetic.
def __add{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_add_pd', a, b}
def __add{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_add_ps', a, b}
def __addsub{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_addsub_pd', a, b}
def __addsub{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_addsub_ps', a, b}
def __div{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_div_pd', a, b}
def __div{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_div_ps', a, b}
def __dp{a:T, b:T, imm8 & match{T,[8]f32} & num{imm8}} = emit{T, '_mm256_dp_ps', a, b, imm8}
def __hadd{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_hadd_pd', a, b}
def __hadd{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_hadd_ps', a, b}
def __hsub{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_hsub_pd', a, b}
def __hsub{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_hsub_ps', a, b}
def __mul{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_mul_pd', a, b}
def __mul{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_mul_ps', a, b}
def __sub{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_sub_pd', a, b}
def __sub{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_sub_ps', a, b}

# Casts between 256-bit types and between 128-bit and 256-bit vectors.
def __castpd{a:T & match{T,[4]f64}} = emit{[8]f32, '_mm256_castpd_ps', a}
def __castps{a:T & match{T,[8]f32}} = emit{[4]f64, '_mm256_castps_pd', a}
def __castsi256{a:T & match{T,[8]u32}} = emit{[8]f32, '_mm256_castsi256_ps', a}
def __castsi256{a:T & match{T,[4]u64}} = emit{[4]f64, '_mm256_castsi256_pd', a}
def __castps256{a:T & match{T,[8]f32}} = emit{[4]f32, '_mm256_castps256_ps128', a}
def __castpd256{a:T & match{T,[4]f64}} = emit{[2]f64, '_mm256_castpd256_pd128', a}
def __castps128{a:T & match{T,[4]f32}} = emit{[8]f32, '_mm256_castps128_ps256', a}
def __castpd128{a:T & match{T,[2]f64}} = emit{[4]f64, '_mm256_castpd128_pd256', a}
def __zextps128{a:T & match{T,[4]f32}} = emit{[8]f32, '_mm256_zextps128_ps256', a}
def __zextpd128{a:T & match{T,[2]f64}} = emit{[4]f64, '_mm256_zextpd128_pd256', a}

# Immediate-selected comparisons for 128-bit and 256-bit float vectors.
def __cmp{a:T, b:T, imm8 & match{T,[2]f64} & num{imm8}} = emit{T, '_mm_cmp_pd', a, b, imm8}
def __cmp{a:T, b:T, imm8 & match{T,[4]f64} & num{imm8}} = emit{T, '_mm256_cmp_pd', a, b, imm8}
def __cmp{a:T, b:T, imm8 & match{T,[4]f32} & num{imm8}} = emit{T, '_mm_cmp_ps', a, b, imm8}
def __cmp{a:T, b:T, imm8 & match{T,[8]f32} & num{imm8}} = emit{T, '_mm256_cmp_ps', a, b, imm8}

# Conversions.
def __cvtepi32_4f64{a:T & match{T,[4]i32}} = emit{[4]f64, '_mm256_cvtepi32_pd', a}
def __cvtepi32{a:T & match{T,[8]i32}} = emit{[8]f32, '_mm256_cvtepi32_ps', a}
def __cvtpd_4f32{a:T & match{T,[4]f64}} = emit{[4]f32, '_mm256_cvtpd_ps', a}
def __cvtps{a:T & match{T,[8]f32}} = emit{[8]i32, '_mm256_cvtps_epi32', a}
def __cvtps_4f64{a:T & match{T,[4]f32}} = emit{[4]f64, '_mm256_cvtps_pd', a}
def __cvttpd{a:T & match{T,[4]f64}} = emit{[4]i32, '_mm256_cvttpd_epi32', a}
def __cvtpd_4i32{a:T & match{T,[4]f64}} = emit{[4]i32, '_mm256_cvtpd_epi32', a}
def __cvttps{a:T & match{T,[8]f32}} = emit{[8]i32, '_mm256_cvttps_epi32', a}
def __cvtss{a:T & match{T,[8]f32}} = emit{f32, '_mm256_cvtss_f32', a}
def __cvtsd{a:T & match{T,[4]f64}} = emit{f64, '_mm256_cvtsd_f64', a}
def __cvtsi256{a:T & match{T,[8]u32}} = emit{i32, '_mm256_cvtsi256_si32', a}

# Elementary functions.
def __rcp{a:T & match{T,[8]f32}} = emit{T, '_mm256_rcp_ps', a}
def __rsqrt{a:T & match{T,[8]f32}} = emit{T, '_mm256_rsqrt_ps', a}
# AVX bindings, continued. One definition per line; each forwards its
# arguments unchanged to the quoted intrinsic, with the first emit argument
# giving the result type.

# Square root.
def __sqrt{a:T & match{T,[4]f64}} = emit{T, '_mm256_sqrt_pd', a}
def __sqrt{a:T & match{T,[8]f32}} = emit{T, '_mm256_sqrt_ps', a}

# Register housekeeping and undefined values (no arguments).
def __zeroall{} = emit{void, '_mm256_zeroall'}
def __zeroupper{} = emit{void, '_mm256_zeroupper'}
def __undefined_8f32{} = emit{[8]f32, '_mm256_undefined_ps'}
def __undefined_4f64{} = emit{[4]f64, '_mm256_undefined_pd'}

# Loads from f64 / f32 pointers.
def __load_pd_4f64{mem_addr:T & match{T,__pnt{f64}}} = emit{[4]f64, '_mm256_load_pd', mem_addr}
def __load_ps_8f32{mem_addr:T & match{T,__pnt{f32}}} = emit{[8]f32, '_mm256_load_ps', mem_addr}
def __loadu_4f64{mem_addr:T & match{T,__pnt{f64}}} = emit{[4]f64, '_mm256_loadu_pd', mem_addr}
def __loadu_8f32{mem_addr:T & match{T,__pnt{f32}}} = emit{[8]f32, '_mm256_loadu_ps', mem_addr}
# Masked loads: the width of the integer mask vector picks the 128/256-bit form.
def __maskload{mem_addr:T, mask:S & match{T,__pnt{f64}} & intvec{256,S}} = emit{[4]f64, '_mm256_maskload_pd', mem_addr, mask}
def __maskload{mem_addr:T, mask:S & match{T,__pnt{f64}} & intvec{128,S}} = emit{[2]f64, '_mm_maskload_pd', mem_addr, mask}
def __maskload{mem_addr:T, mask:S & match{T,__pnt{f32}} & intvec{256,S}} = emit{[8]f32, '_mm256_maskload_ps', mem_addr, mask}
def __maskload{mem_addr:T, mask:S & match{T,__pnt{f32}} & intvec{128,S}} = emit{[4]f32, '_mm_maskload_ps', mem_addr, mask}
def __loadu2{hiaddr:T, loaddr:T & match{T,__pnt{f32}}} = emit{[8]f32, '_mm256_loadu2_m128', hiaddr, loaddr}
def __loadu2{hiaddr:T, loaddr:T & match{T,__pnt{f64}}} = emit{[4]f64, '_mm256_loadu2_m128d', hiaddr, loaddr}

# Bitwise logic on float vectors.
def __and{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_and_pd', a, b}
def __and{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_and_ps', a, b}
def __andnot{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_andnot_pd', a, b}
def __andnot{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_andnot_ps', a, b}
def __or{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_or_pd', a, b}
def __or{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_or_ps', a, b}
def __xor{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_xor_pd', a, b}
def __xor{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_xor_ps', a, b}

# Tests (testz / testc / testnzc), all yielding i32.
def __testz{a:T, b:T & intvec{256,T}} = emit{i32, '_mm256_testz_si256', a, b}
def __testc{a:T, b:T & intvec{256,T}} = emit{i32, '_mm256_testc_si256', a, b}
def __testnzc{a:T, b:T & intvec{256,T}} = emit{i32, '_mm256_testnzc_si256', a, b}
def __testz{a:T, b:T & match{T,[4]f64}} = emit{i32, '_mm256_testz_pd', a, b}
def __testc{a:T, b:T & match{T,[4]f64}} = emit{i32, '_mm256_testc_pd', a, b}
def __testnzc{a:T, b:T & match{T,[4]f64}} = emit{i32, '_mm256_testnzc_pd', a, b}
def __testz{a:T, b:T & match{T,[2]f64}} = emit{i32, '_mm_testz_pd', a, b}
def __testc{a:T, b:T & match{T,[2]f64}} = emit{i32, '_mm_testc_pd', a, b}
def __testnzc{a:T, b:T & match{T,[2]f64}} = emit{i32, '_mm_testnzc_pd', a, b}
def __testz{a:T, b:T & match{T,[8]f32}} = emit{i32, '_mm256_testz_ps', a, b}
def __testc{a:T, b:T & match{T,[8]f32}} = emit{i32, '_mm256_testc_ps', a, b}
def __testnzc{a:T, b:T & match{T,[8]f32}} = emit{i32, '_mm256_testnzc_ps', a, b}
def __testz{a:T, b:T & match{T,[4]f32}} = emit{i32, '_mm_testz_ps', a, b}
def __testc{a:T, b:T & match{T,[4]f32}} = emit{i32, '_mm_testc_ps', a, b}
def __testnzc{a:T, b:T & match{T,[4]f32}} = emit{i32, '_mm_testnzc_ps', a, b}

# Mask extraction and element duplication.
def __movemask{a:T & match{T,[4]f64}} = emit{i32, '_mm256_movemask_pd', a}
def __movemask{a:T & match{T,[8]f32}} = emit{i32, '_mm256_movemask_ps', a}
def __movehdup{a:T & match{T,[8]f32}} = emit{T, '_mm256_movehdup_ps', a}
def __moveldup{a:T & match{T,[8]f32}} = emit{T, '_mm256_moveldup_ps', a}
def __movedup{a:T & match{T,[4]f64}} = emit{T, '_mm256_movedup_pd', a}

# Vector construction: setzero / set / setr / set1.
def __setzero_4f64{} = emit{[4]f64, '_mm256_setzero_pd'}
def __setzero_8f32{} = emit{[8]f32, '_mm256_setzero_ps'}
def __set{e3:T, e2:T, e1:T, e0:T & match{T,f64}} = emit{[4]f64, '_mm256_set_pd', e3, e2, e1, e0}
def __set{e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,f32}} = emit{[8]f32, '_mm256_set_ps', e7, e6, e5, e4, e3, e2, e1, e0}
def __set{e31:T, e30:T, e29:T, e28:T, e27:T, e26:T, e25:T, e24:T, e23:T, e22:T, e21:T, e20:T, e19:T, e18:T, e17:T, e16:T, e15:T, e14:T, e13:T, e12:T, e11:T, e10:T, e9:T, e8:T, e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i8}} = emit{[32]i8, '_mm256_set_epi8', e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0}
def __set{e15:T, e14:T, e13:T, e12:T, e11:T, e10:T, e9:T, e8:T, e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i16}} = emit{[16]i16, '_mm256_set_epi16', e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0}
def __set{e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i32}} = emit{[8]i32, '_mm256_set_epi32', e7, e6, e5, e4, e3, e2, e1, e0}
def __set{e3:T, e2:T, e1:T, e0:T & match{T,i64}} = emit{[4]i64, '_mm256_set_epi64x', e3, e2, e1, e0}
def __setr{e3:T, e2:T, e1:T, e0:T & match{T,f64}} = emit{[4]f64, '_mm256_setr_pd', e3, e2, e1, e0}
def __setr{e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,f32}} = emit{[8]f32, '_mm256_setr_ps', e7, e6, e5, e4, e3, e2, e1, e0}
def __setr{e31:T, e30:T, e29:T, e28:T, e27:T, e26:T, e25:T, e24:T, e23:T, e22:T, e21:T, e20:T, e19:T, e18:T, e17:T, e16:T, e15:T, e14:T, e13:T, e12:T, e11:T, e10:T, e9:T, e8:T, e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i8}} = emit{[32]i8, '_mm256_setr_epi8', e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0}
def __setr{e15:T, e14:T, e13:T, e12:T, e11:T, e10:T, e9:T, e8:T, e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i16}} = emit{[16]i16, '_mm256_setr_epi16', e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0}
def __setr{e7:T, e6:T, e5:T, e4:T, e3:T, e2:T, e1:T, e0:T & match{T,i32}} = emit{[8]i32, '_mm256_setr_epi32', e7, e6, e5, e4, e3, e2, e1, e0}
def __setr{e3:T, e2:T, e1:T, e0:T & match{T,i64}} = emit{[4]i64, '_mm256_setr_epi64x', e3, e2, e1, e0}
def __set1_4f64{a:T & match{T,f64}} = emit{[4]f64, '_mm256_set1_pd', a}
def __set1_8f32{a:T & match{T,f32}} = emit{[8]f32, '_mm256_set1_ps', a}
def __set1_32i8{a:T & match{T,i8}} = emit{[32]i8, '_mm256_set1_epi8', a}
def __set1_16i16{a:T & match{T,i16}} = emit{[16]i16, '_mm256_set1_epi16', a}
def __set1_8i32{a:T & match{T,i32}} = emit{[8]i32, '_mm256_set1_epi32', a}
def __set1_4i64{a:T & match{T,i64}} = emit{[4]i64, '_mm256_set1_epi64x', a}
# Build a 256-bit vector from two 128-bit halves.
def __set{hi:T, lo:T & match{T,[4]f32}} = emit{[8]f32, '_mm256_set_m128', hi, lo}
def __set{hi:T, lo:T & match{T,[2]f64}} = emit{[4]f64, '_mm256_set_m128d', hi, lo}
def __setr{lo:T, hi:T & match{T,[4]f32}} = emit{[8]f32, '_mm256_setr_m128', lo, hi}
def __setr{lo:T, hi:T & match{T,[2]f64}} = emit{[4]f64, '_mm256_setr_m128d', lo, hi}

# Min / max and rounding.
def __max{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_max_pd', a, b}
def __max{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_max_ps', a, b}
def __min{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_min_pd', a, b}
def __min{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_min_ps', a, b}
def __round{a:T, rounding & match{T,[4]f64} & num{rounding}} = emit{T, '_mm256_round_pd', a, rounding}
def __round{a:T, rounding & match{T,[8]f32} & num{rounding}} = emit{T, '_mm256_round_ps', a, rounding}
def __floor{a:T & match{T,[8]f32}} = emit{T, '_mm256_floor_ps', a}
def __ceil{a:T & match{T,[8]f32}} = emit{T, '_mm256_ceil_ps', a}
def __floor{a:T & match{T,[4]f64}} = emit{T, '_mm256_floor_pd', a}
def __ceil{a:T & match{T,[4]f64}} = emit{T, '_mm256_ceil_pd', a}

# Stores to f64 / f32 pointers.
def __store{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[4]f64}} = emit{void, '_mm256_store_pd', mem_addr, a}
def __store{mem_addr:T, a:S & match{T,__pnt{f32}} & match{S,[8]f32}} = emit{void, '_mm256_store_ps', mem_addr, a}
def __storeu{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[4]f64}} = emit{void, '_mm256_storeu_pd', mem_addr, a}
def __storeu{mem_addr:T, a:S & match{T,__pnt{f32}} & match{S,[8]f32}} = emit{void, '_mm256_storeu_ps', mem_addr, a}
def __maskstore{mem_addr:T, mask:S, a:R & match{T,__pnt{f64}} & intvec{256,S} & match{R,[4]f64}} = emit{void, '_mm256_maskstore_pd', mem_addr, mask, a}
def __maskstore{mem_addr:T, mask:S, a:R & match{T,__pnt{f64}} & intvec{128,S} & match{R,[2]f64}} = emit{void, '_mm_maskstore_pd', mem_addr, mask, a}
def __maskstore{mem_addr:T, mask:S, a:R & match{T,__pnt{f32}} & intvec{256,S} & match{R,[8]f32}} = emit{void, '_mm256_maskstore_ps', mem_addr, mask, a}
def __maskstore{mem_addr:T, mask:S, a:R & match{T,__pnt{f32}} & intvec{128,S} & match{R,[4]f32}} = emit{void, '_mm_maskstore_ps', mem_addr, mask, a}
def __stream{mem_addr:T, a:S & match{T,__pnt{f64}} & match{S,[4]f64}} = emit{void, '_mm256_stream_pd', mem_addr, a}
def __stream{mem_addr:T, a:S & match{T,__pnt{f32}} & match{S,[8]f32}} = emit{void, '_mm256_stream_ps', mem_addr, a}
def __storeu2{hiaddr:T, loaddr:T, a:S & match{T,__pnt{f32}} & match{S,[8]f32}} = emit{void, '_mm256_storeu2_m128', hiaddr, loaddr, a}
def __storeu2{hiaddr:T, loaddr:T, a:S & match{T,__pnt{f64}} & match{S,[4]f64}} = emit{void, '_mm256_storeu2_m128d', hiaddr, loaddr, a}

# Blend, shuffle and extraction.
def __blend{a:T, b:T, imm8 & match{T,[4]f64} & num{imm8}} = emit{T, '_mm256_blend_pd', a, b, imm8}
def __blend{a:T, b:T, imm8 & match{T,[8]f32} & num{imm8}} = emit{T, '_mm256_blend_ps', a, b, imm8}
def __blendv{a:T, b:T, mask:S & match{T,[4]f64} & intvec{256,S}} = emit{T, '_mm256_blendv_pd', a, b, mask}
def __blendv{a:T, b:T, mask:S & match{T,[8]f32} & intvec{256,S}} = emit{T, '_mm256_blendv_ps', a, b, mask}
def __shuffle{a:T, b:T, imm8 & match{T,[4]f64} & num{imm8}} = emit{T, '_mm256_shuffle_pd', a, b, imm8}
def __shuffle{a:T, b:T, imm8 & match{T,[8]f32} & num{imm8}} = emit{T, '_mm256_shuffle_ps', a, b, imm8}
def __extractf128{a:T, imm8 & match{T,[8]f32} & num{imm8}} = emit{[4]f32, '_mm256_extractf128_ps', a, imm8}
def __extractf128{a:T, imm8 & match{T,[4]f64} & num{imm8}} = emit{[2]f64, '_mm256_extractf128_pd', a, imm8}
def __extract{a:T, index & match{T,[8]i32} & num{index}} = emit{i32, '_mm256_extract_epi32', a, index}
# AVX bindings, continued, followed by the AVX2 and FMA sections.
# Layout note: '#' starts a comment that runs to the end of the physical line,
# so every section marker sits on a line of its own and every definition gets
# its own line. Definitions are kept in their original order.
# NOTE(review): overload resolution may depend on definition order - do not
# reorder same-named definitions without confirming.
def __extract{a:T, index & match{T,[4]i64} & num{index}} = emit{i64, '_mm256_extract_epi64', a, index}

# Permutes.
def __permutevar{a:T, b:S & match{T,[8]f32} & match{S,[8]u32}} = emit{T, '_mm256_permutevar_ps', a, b}
def __permutevar{a:T, b:S & match{T,[4]f32} & match{S,[4]u32}} = emit{T, '_mm_permutevar_ps', a, b}
def __permute{a:T, imm8 & match{T,[8]f32} & num{imm8}} = emit{T, '_mm256_permute_ps', a, imm8}
def __permute{a:T, imm8 & match{T,[4]f32} & num{imm8}} = emit{T, '_mm_permute_ps', a, imm8}
def __permutevar{a:T, b:S & match{T,[4]f64} & match{S,[4]u64}} = emit{T, '_mm256_permutevar_pd', a, b}
def __permutevar{a:T, b:S & match{T,[2]f64} & match{S,[2]u64}} = emit{T, '_mm_permutevar_pd', a, b}
def __permute{a:T, imm8 & match{T,[4]f64} & num{imm8}} = emit{T, '_mm256_permute_pd', a, imm8}
def __permute{a:T, imm8 & match{T,[2]f64} & num{imm8}} = emit{T, '_mm_permute_pd', a, imm8}
def __permute2f128{a:T, b:T, imm8 & match{T,[8]f32} & num{imm8}} = emit{T, '_mm256_permute2f128_ps', a, b, imm8}
def __permute2f128{a:T, b:T, imm8 & match{T,[4]f64} & num{imm8}} = emit{T, '_mm256_permute2f128_pd', a, b, imm8}
def __permute2f128{a:T, b:T, imm8 & intvec{256,T} & num{imm8}} = emit{T, '_mm256_permute2f128_si256', a, b, imm8}

# Insertion of 128-bit halves and of single elements.
def __insertf128{a:T, b:S, imm8 & match{T,[8]f32} & match{S,[4]f32} & num{imm8}} = emit{T, '_mm256_insertf128_ps', a, b, imm8}
def __insertf128{a:T, b:S, imm8 & match{T,[4]f64} & match{S,[2]f64} & num{imm8}} = emit{T, '_mm256_insertf128_pd', a, b, imm8}
def __insertf128{a:T, b:S, imm8 & intvec{256,T} & intvec{128,S} & num{imm8}} = emit{T, '_mm256_insertf128_si256', a, b, imm8}
def __insert{a:T, i:S, index & match{T,[32]i8} & match{S,i8} & num{index}} = emit{T, '_mm256_insert_epi8', a, i, index}
def __insert{a:T, i:S, index & match{T,[16]i16} & match{S,i16} & num{index}} = emit{T, '_mm256_insert_epi16', a, i, index}
def __insert{a:T, i:S, index & match{T,[8]i32} & match{S,i32} & num{index}} = emit{T, '_mm256_insert_epi32', a, i, index}
def __insert{a:T, i:S, index & match{T,[4]i64} & match{S,i64} & num{index}} = emit{T, '_mm256_insert_epi64', a, i, index}

# Unpack and broadcast.
def __unpackhi{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_unpackhi_pd', a, b}
def __unpackhi{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_unpackhi_ps', a, b}
def __unpacklo{a:T, b:T & match{T,[4]f64}} = emit{T, '_mm256_unpacklo_pd', a, b}
def __unpacklo{a:T, b:T & match{T,[8]f32}} = emit{T, '_mm256_unpacklo_ps', a, b}
def __broadcast{mem_addr:T & match{T,__pnt{[4]f32}}} = emit{[8]f32, '_mm256_broadcast_ps', mem_addr}
def __broadcast{mem_addr:T & match{T,__pnt{[2]f64}}} = emit{[4]f64, '_mm256_broadcast_pd', mem_addr}

# AVX2

# Integer arithmetic.
def __add{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_add_epi8', a, b}
def __add{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_add_epi16', a, b}
def __add{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_add_epi32', a, b}
def __add{a:T, b:T & match{T,[4]i64}} = emit{T, '_mm256_add_epi64', a, b}
def __adds{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_adds_epi8', a, b}
def __adds{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_adds_epi16', a, b}
def __adds{a:T, b:T & match{T,[32]u8}} = emit{T, '_mm256_adds_epu8', a, b}
def __adds{a:T, b:T & match{T,[16]u16}} = emit{T, '_mm256_adds_epu16', a, b}
def __hadd{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_hadd_epi16', a, b}
def __hadd{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_hadd_epi32', a, b}
def __hadds{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_hadds_epi16', a, b}
def __hsub{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_hsub_epi16', a, b}
def __hsub{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_hsub_epi32', a, b}
def __hsubs{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_hsubs_epi16', a, b}
def __madd{a:T, b:T & match{T,[16]i16}} = emit{[8]i32, '_mm256_madd_epi16', a, b}
def __maddubs{a:T, b:T & match{T,[32]i8}} = emit{[16]i16, '_mm256_maddubs_epi16', a, b}
def __mul{a:T, b:T & match{T,[8]i32}} = emit{[4]i64, '_mm256_mul_epi32', a, b}
def __mul{a:T, b:T & match{T,[8]u32}} = emit{[4]u64, '_mm256_mul_epu32', a, b}
def __mulhi{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_mulhi_epi16', a, b}
def __mulhi{a:T, b:T & match{T,[16]u16}} = emit{T, '_mm256_mulhi_epu16', a, b}
def __mulhrs{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_mulhrs_epi16', a, b}
def __mullo{a:T, b:T & match{T,[16]i16}} = emit{[16]u16, '_mm256_mullo_epi16', a, b}
def __mullo{a:T, b:T & match{T,[8]i32}} = emit{[8]u32, '_mm256_mullo_epi32', a, b}
def __sad{a:T, b:T & match{T,[32]u8}} = emit{[16]u16, '_mm256_sad_epu8', a, b}
def __sign{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_sign_epi8', a, b}
def __sign{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_sign_epi16', a, b}
def __sign{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_sign_epi32', a, b}
def __sub{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_sub_epi8', a, b}
def __sub{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_sub_epi16', a, b}
def __sub{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_sub_epi32', a, b}
def __sub{a:T, b:T & match{T,[4]i64}} = emit{T, '_mm256_sub_epi64', a, b}
def __subs{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_subs_epi8', a, b}
def __subs{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_subs_epi16', a, b}
def __subs{a:T, b:T & match{T,[32]u8}} = emit{T, '_mm256_subs_epu8', a, b}
def __subs{a:T, b:T & match{T,[16]u16}} = emit{T, '_mm256_subs_epu16', a, b}

# Integer comparison.
def __cmpeq{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_cmpeq_epi8', a, b}
def __cmpeq{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_cmpeq_epi16', a, b}
def __cmpeq{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_cmpeq_epi32', a, b}
def __cmpeq{a:T, b:T & match{T,[4]i64}} = emit{T, '_mm256_cmpeq_epi64', a, b}
def __cmpgt{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_cmpgt_epi8', a, b}
def __cmpgt{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_cmpgt_epi16', a, b}
def __cmpgt{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_cmpgt_epi32', a, b}
def __cmpgt{a:T, b:T & match{T,[4]i64}} = emit{T, '_mm256_cmpgt_epi64', a, b}

# Widening conversions; the name suffix spells out the result type.
def __cvtepi16_8i32{a:T & match{T,[8]i16}} = emit{[8]i32, '_mm256_cvtepi16_epi32', a}
def __cvtepi16_4i64{a:T & match{T,[8]i16}} = emit{[4]i64, '_mm256_cvtepi16_epi64', a}
def __cvtepi32_4i64{a:T & match{T,[4]i32}} = emit{[4]i64, '_mm256_cvtepi32_epi64', a}
def __cvtepi8_16i16{a:T & match{T,[16]i8}} = emit{[16]i16, '_mm256_cvtepi8_epi16', a}
def __cvtepi8_8i32{a:T & match{T,[16]i8}} = emit{[8]i32, '_mm256_cvtepi8_epi32', a}
def __cvtepi8_4i64{a:T & match{T,[16]i8}} = emit{[4]i64, '_mm256_cvtepi8_epi64', a}
def __cvtepu16_8i32{a:T & match{T,[8]i16}} = emit{[8]i32, '_mm256_cvtepu16_epi32', a}
def __cvtepu16_4i64{a:T & match{T,[8]i16}} = emit{[4]i64, '_mm256_cvtepu16_epi64', a}
def __cvtepu32_4i64{a:T & match{T,[4]i32}} = emit{[4]i64, '_mm256_cvtepu32_epi64', a}
def __cvtepu8_16i16{a:T & match{T,[16]i8}} = emit{[16]i16, '_mm256_cvtepu8_epi16', a}
def __cvtepu8_8i32{a:T & match{T,[16]i8}} = emit{[8]i32, '_mm256_cvtepu8_epi32', a}
def __cvtepu8_4i64{a:T & match{T,[16]i8}} = emit{[4]i64, '_mm256_cvtepu8_epi64', a}

# Gathers. Where pointer and index types alone cannot select between the
# 128-bit and 256-bit intrinsic, the result type is part of the name.
def __i32gather_2f64{base_addr:T, vindex:S, scale & match{T,__pnt{f64}} & match{S,[4]i32} & num{scale}} = emit{[2]f64, '_mm_i32gather_pd', base_addr, vindex, scale}
def __i32gather_4f64{base_addr:T, vindex:S, scale & match{T,__pnt{f64}} & match{S,[4]i32} & num{scale}} = emit{[4]f64, '_mm256_i32gather_pd', base_addr, vindex, scale}
def __i32gather{base_addr:T, vindex:S, scale & match{T,__pnt{f32}} & match{S,[4]i32} & num{scale}} = emit{[4]f32, '_mm_i32gather_ps', base_addr, vindex, scale}
def __i32gather{base_addr:T, vindex:S, scale & match{T,__pnt{f32}} & match{S,[8]i32} & num{scale}} = emit{[8]f32, '_mm256_i32gather_ps', base_addr, vindex, scale}
def __i32gather{base_addr:T, vindex:S, scale & match{T,__pnt{i32}} & match{S,[4]i32} & num{scale}} = emit{S, '_mm_i32gather_epi32', base_addr, vindex, scale}
def __i32gather{base_addr:T, vindex:S, scale & match{T,__pnt{i32}} & match{S,[8]i32} & num{scale}} = emit{S, '_mm256_i32gather_epi32', base_addr, vindex, scale}
def __i32gather_2i64{base_addr:T, vindex:S, scale & match{T,__pnt{i64}} & match{S,[4]i32} & num{scale}} = emit{[2]i64, '_mm_i32gather_epi64', base_addr, vindex, scale}
def __i32gather_4i64{base_addr:T, vindex:S, scale & match{T,__pnt{i64}} & match{S,[4]i32} & num{scale}} = emit{[4]i64, '_mm256_i32gather_epi64', base_addr, vindex, scale}
def __i64gather{base_addr:T, vindex:S, scale & match{T,__pnt{f64}} & match{S,[2]i64} & num{scale}} = emit{[2]f64, '_mm_i64gather_pd', base_addr, vindex, scale}
def __i64gather{base_addr:T, vindex:S, scale & match{T,__pnt{f64}} & match{S,[4]i64} & num{scale}} = emit{[4]f64, '_mm256_i64gather_pd', base_addr, vindex, scale}
def __i64gather{base_addr:T, vindex:S, scale & match{T,__pnt{f32}} & match{S,[2]i64} & num{scale}} = emit{[4]f32, '_mm_i64gather_ps', base_addr, vindex, scale}
def __i64gather{base_addr:T, vindex:S, scale & match{T,__pnt{f32}} & match{S,[4]i64} & num{scale}} = emit{[4]f32, '_mm256_i64gather_ps', base_addr, vindex, scale}
def __i64gather{base_addr:T, vindex:S, scale & match{T,__pnt{i32}} & match{S,[2]i64} & num{scale}} = emit{[4]i32, '_mm_i64gather_epi32', base_addr, vindex, scale}
def __i64gather{base_addr:T, vindex:S, scale & match{T,__pnt{i32}} & match{S,[4]i64} & num{scale}} = emit{[4]i32, '_mm256_i64gather_epi32', base_addr, vindex, scale}
def __i64gather{base_addr:T, vindex:S, scale & match{T,__pnt{i64}} & match{S,[2]i64} & num{scale}} = emit{S, '_mm_i64gather_epi64', base_addr, vindex, scale}
def __i64gather{base_addr:T, vindex:S, scale & match{T,__pnt{i64}} & match{S,[4]i64} & num{scale}} = emit{S, '_mm256_i64gather_epi64', base_addr, vindex, scale}

# Masked gathers: the type of src is also the result type.
def __mask_i32gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[2]f64} & match{S,__pnt{f64}} & match{R,[4]i32} & intvec{128,Q} & num{scale}} = emit{T, '_mm_mask_i32gather_pd', src, base_addr, vindex, mask, scale}
def __mask_i32gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[4]f64} & match{S,__pnt{f64}} & match{R,[4]i32} & intvec{256,Q} & num{scale}} = emit{T, '_mm256_mask_i32gather_pd', src, base_addr, vindex, mask, scale}
def __mask_i32gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[4]f32} & match{S,__pnt{f32}} & match{R,[4]i32} & intvec{128,Q} & num{scale}} = emit{T, '_mm_mask_i32gather_ps', src, base_addr, vindex, mask, scale}
def __mask_i32gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[8]f32} & match{S,__pnt{f32}} & match{R,[8]i32} & intvec{256,Q} & num{scale}} = emit{T, '_mm256_mask_i32gather_ps', src, base_addr, vindex, mask, scale}
def __mask_i32gather{src:T, base_addr:S, vindex:T, mask:R, scale & match{T,[4]i32} & match{S,__pnt{i32}} & intvec{128,R} & num{scale}} = emit{T, '_mm_mask_i32gather_epi32', src, base_addr, vindex, mask, scale}
def __mask_i32gather{src:T, base_addr:S, vindex:T, mask:R, scale & match{T,[8]i32} & match{S,__pnt{i32}} & intvec{256,R} & num{scale}} = emit{T, '_mm256_mask_i32gather_epi32', src, base_addr, vindex, mask, scale}
def __mask_i32gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[2]i64} & match{S,__pnt{i64}} & match{R,[4]i32} & intvec{128,Q} & num{scale}} = emit{T, '_mm_mask_i32gather_epi64', src, base_addr, vindex, mask, scale}
def __mask_i32gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[4]i64} & match{S,__pnt{i64}} & match{R,[4]i32} & intvec{256,Q} & num{scale}} = emit{T, '_mm256_mask_i32gather_epi64', src, base_addr, vindex, mask, scale}
def __mask_i64gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[2]f64} & match{S,__pnt{f64}} & match{R,[2]i64} & intvec{128,Q} & num{scale}} = emit{T, '_mm_mask_i64gather_pd', src, base_addr, vindex, mask, scale}
def __mask_i64gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[4]f64} & match{S,__pnt{f64}} & match{R,[4]i64} & intvec{256,Q} & num{scale}} = emit{T, '_mm256_mask_i64gather_pd', src, base_addr, vindex, mask, scale}
def __mask_i64gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[4]f32} & match{S,__pnt{f32}} & match{R,[2]i64} & intvec{128,Q} & num{scale}} = emit{T, '_mm_mask_i64gather_ps', src, base_addr, vindex, mask, scale}
def __mask_i64gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[4]f32} & match{S,__pnt{f32}} & match{R,[4]i64} & intvec{128,Q} & num{scale}} = emit{T, '_mm256_mask_i64gather_ps', src, base_addr, vindex, mask, scale}
def __mask_i64gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[4]i32} & match{S,__pnt{i32}} & match{R,[2]i64} & intvec{128,Q} & num{scale}} = emit{T, '_mm_mask_i64gather_epi32', src, base_addr, vindex, mask, scale}
def __mask_i64gather{src:T, base_addr:S, vindex:R, mask:Q, scale & match{T,[4]i32} & match{S,__pnt{i32}} & match{R,[4]i64} & intvec{128,Q} & num{scale}} = emit{T, '_mm256_mask_i64gather_epi32', src, base_addr, vindex, mask, scale}
def __mask_i64gather{src:T, base_addr:S, vindex:T, mask:R, scale & match{T,[2]i64} & match{S,__pnt{i64}} & intvec{128,R} & num{scale}} = emit{T, '_mm_mask_i64gather_epi64', src, base_addr, vindex, mask, scale}
def __mask_i64gather{src:T, base_addr:S, vindex:T, mask:R, scale & match{T,[4]i64} & match{S,__pnt{i64}} & intvec{256,R} & num{scale}} = emit{T, '_mm256_mask_i64gather_epi64', src, base_addr, vindex, mask, scale}

# Masked integer loads.
def __maskload{mem_addr:T, mask:S & match{T,__pnt{i32}} & intvec{128,S}} = emit{[4]i32, '_mm_maskload_epi32', mem_addr, mask}
def __maskload{mem_addr:T, mask:S & match{T,__pnt{i32}} & intvec{256,S}} = emit{[8]i32, '_mm256_maskload_epi32', mem_addr, mask}
def __maskload{mem_addr:T, mask:S & match{T,__pnt{i64}} & intvec{128,S}} = emit{[2]i64, '_mm_maskload_epi64', mem_addr, mask}
def __maskload{mem_addr:T, mask:S & match{T,__pnt{i64}} & intvec{256,S}} = emit{[4]i64, '_mm256_maskload_epi64', mem_addr, mask}

# Bitwise logic on any 256-bit integer vector.
def __and{a:T, b:T & intvec{256,T}} = emit{T, '_mm256_and_si256', a, b}
def __andnot{a:T, b:T & intvec{256,T}} = emit{T, '_mm256_andnot_si256', a, b}
def __or{a:T, b:T & intvec{256,T}} = emit{T, '_mm256_or_si256', a, b}
def __xor{a:T, b:T & intvec{256,T}} = emit{T, '_mm256_xor_si256', a, b}

# Align, mask, pack and average.
def __alignr{a:T, b:T, imm8 & match{T,[32]i8} & num{imm8}} = emit{T, '_mm256_alignr_epi8', a, b, imm8}
def __movemask{a:T & match{T,[32]i8}} = emit{i32, '_mm256_movemask_epi8', a}
def __mpsadbw{a:T, b:T, imm8 & match{T,[32]u8} & num{imm8}} = emit{T, '_mm256_mpsadbw_epu8', a, b, imm8}
def __packs{a:T, b:T & match{T,[16]i16}} = emit{[32]i8, '_mm256_packs_epi16', a, b}
def __packs{a:T, b:T & match{T,[8]i32}} = emit{[16]i16, '_mm256_packs_epi32', a, b}
def __packus{a:T, b:T & match{T,[16]i16}} = emit{[32]i8, '_mm256_packus_epi16', a, b}
def __packus{a:T, b:T & match{T,[8]i32}} = emit{[16]i16, '_mm256_packus_epi32', a, b}
def __avg{a:T, b:T & match{T,[32]u8}} = emit{T, '_mm256_avg_epu8', a, b}
def __avg{a:T, b:T & match{T,[16]u16}} = emit{T, '_mm256_avg_epu16', a, b}

# Shifts.
# NOTE(review): the intvec{256,T} overloads of __slli / __srli emit the _si256
# intrinsics and are defined before the element-typed overloads of the same
# name; confirm which overload is selected for element types that have no
# dedicated definition (e.g. unsigned types for __slli, signed for __srli).
def __slli{a:T, imm8 & intvec{256,T} & num{imm8}} = emit{T, '_mm256_slli_si256', a, imm8}
def __bslli{a:T, imm8 & intvec{256,T} & num{imm8}} = emit{T, '_mm256_bslli_epi128', a, imm8}
def __sll{a:T, count:S & match{T,[16]i16} & match{S,[8]i16}} = emit{T, '_mm256_sll_epi16', a, count}
def __slli{a:T, imm8 & match{T,[16]i16} & num{imm8}} = emit{T, '_mm256_slli_epi16', a, imm8}
def __sll{a:T, count:S & match{T,[8]i32} & match{S,[4]i32}} = emit{T, '_mm256_sll_epi32', a, count}
def __slli{a:T, imm8 & match{T,[8]i32} & num{imm8}} = emit{T, '_mm256_slli_epi32', a, imm8}
def __sll{a:T, count:S & match{T,[4]i64} & match{S,[2]i64}} = emit{T, '_mm256_sll_epi64', a, count}
def __slli{a:T, imm8 & match{T,[4]i64} & num{imm8}} = emit{T, '_mm256_slli_epi64', a, imm8}
def __sllv{a:T, count:T & match{T,[4]i32}} = emit{T, '_mm_sllv_epi32', a, count}
def __sllv{a:T, count:T & match{T,[8]i32}} = emit{T, '_mm256_sllv_epi32', a, count}
def __sllv{a:T, count:T & match{T,[2]i64}} = emit{T, '_mm_sllv_epi64', a, count}
def __sllv{a:T, count:T & match{T,[4]i64}} = emit{T, '_mm256_sllv_epi64', a, count}
def __sra{a:T, count:S & match{T,[16]i16} & match{S,[8]i16}} = emit{T, '_mm256_sra_epi16', a, count}
def __srai{a:T, imm8 & match{T,[16]i16} & num{imm8}} = emit{T, '_mm256_srai_epi16', a, imm8}
def __sra{a:T, count:S & match{T,[8]i32} & match{S,[4]i32}} = emit{T, '_mm256_sra_epi32', a, count}
def __srai{a:T, imm8 & match{T,[8]i32} & num{imm8}} = emit{T, '_mm256_srai_epi32', a, imm8}
def __srav{a:T, count:T & match{T,[4]i32}} = emit{T, '_mm_srav_epi32', a, count}
def __srav{a:T, count:T & match{T,[8]i32}} = emit{T, '_mm256_srav_epi32', a, count}
def __srli{a:T, imm8 & intvec{256,T} & num{imm8}} = emit{T, '_mm256_srli_si256', a, imm8}
def __bsrli{a:T, imm8 & intvec{256,T} & num{imm8}} = emit{T, '_mm256_bsrli_epi128', a, imm8}
def __srl{a:T, count:S & match{T,[16]u16} & match{S,[8]u16}} = emit{T, '_mm256_srl_epi16', a, count}
def __srli{a:T, imm8 & match{T,[16]u16} & num{imm8}} = emit{T, '_mm256_srli_epi16', a, imm8}
def __srl{a:T, count:S & match{T,[8]u32} & match{S,[4]u32}} = emit{T, '_mm256_srl_epi32', a, count}
def __srli{a:T, imm8 & match{T,[8]u32} & num{imm8}} = emit{T, '_mm256_srli_epi32', a, imm8}
def __srl{a:T, count:S & match{T,[4]u64} & match{S,[2]u64}} = emit{T, '_mm256_srl_epi64', a, count}
def __srli{a:T, imm8 & match{T,[4]u64} & num{imm8}} = emit{T, '_mm256_srli_epi64', a, imm8}
def __srlv{a:T, count:T & match{T,[4]u32}} = emit{T, '_mm_srlv_epi32', a, count}
def __srlv{a:T, count:T & match{T,[8]u32}} = emit{T, '_mm256_srlv_epi32', a, count}
def __srlv{a:T, count:T & match{T,[2]u64}} = emit{T, '_mm_srlv_epi64', a, count}
def __srlv{a:T, count:T & match{T,[4]u64}} = emit{T, '_mm256_srlv_epi64', a, count}

# Absolute value (unsigned result type of the same shape), min and max.
def __abs{a:T & match{T,[32]i8}} = emit{[32]u8, '_mm256_abs_epi8', a}
def __abs{a:T & match{T,[16]i16}} = emit{[16]u16, '_mm256_abs_epi16', a}
def __abs{a:T & match{T,[8]i32}} = emit{[8]u32, '_mm256_abs_epi32', a}
def __max{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_max_epi8', a, b}
def __max{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_max_epi16', a, b}
def __max{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_max_epi32', a, b}
def __max{a:T, b:T & match{T,[32]u8}} = emit{T, '_mm256_max_epu8', a, b}
def __max{a:T, b:T & match{T,[16]u16}} = emit{T, '_mm256_max_epu16', a, b}
def __max{a:T, b:T & match{T,[8]u32}} = emit{T, '_mm256_max_epu32', a, b}
def __min{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_min_epi8', a, b}
def __min{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_min_epi16', a, b}
def __min{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_min_epi32', a, b}
def __min{a:T, b:T & match{T,[32]u8}} = emit{T, '_mm256_min_epu8', a, b}
def __min{a:T, b:T & match{T,[16]u16}} = emit{T, '_mm256_min_epu16', a, b}
def __min{a:T, b:T & match{T,[8]u32}} = emit{T, '_mm256_min_epu32', a, b}

# Masked integer stores.
def __maskstore{mem_addr:T, mask:S, a:R & match{T,__pnt{i32}} & intvec{128,S} & match{R,[4]i32}} = emit{void, '_mm_maskstore_epi32', mem_addr, mask, a}
def __maskstore{mem_addr:T, mask:S, a:R & match{T,__pnt{i32}} & intvec{256,S} & match{R,[8]i32}} = emit{void, '_mm256_maskstore_epi32', mem_addr, mask, a}
def __maskstore{mem_addr:T, mask:S, a:R & match{T,__pnt{i64}} & intvec{128,S} & match{R,[2]i64}} = emit{void, '_mm_maskstore_epi64', mem_addr, mask, a}
def __maskstore{mem_addr:T, mask:S, a:R & match{T,__pnt{i64}} & intvec{256,S} & match{R,[4]i64}} = emit{void, '_mm256_maskstore_epi64', mem_addr, mask, a}

# Element extraction and blends.
def __extract{a:T, index & match{T,[32]i8} & num{index}} = emit{i32, '_mm256_extract_epi8', a, index}
def __extract{a:T, index & match{T,[16]i16} & num{index}} = emit{i32, '_mm256_extract_epi16', a, index}
def __blend{a:T, b:T, imm8 & match{T,[16]i16} & num{imm8}} = emit{T, '_mm256_blend_epi16', a, b, imm8}
def __blend{a:T, b:T, imm8 & match{T,[4]i32} & num{imm8}} = emit{T, '_mm_blend_epi32', a, b, imm8}
def __blend{a:T, b:T, imm8 & match{T,[8]i32} & num{imm8}} = emit{T, '_mm256_blend_epi32', a, b, imm8}
def __blendv{a:T, b:T, mask:S & match{T,[32]i8} & intvec{256,S}} = emit{T, '_mm256_blendv_epi8', a, b, mask}

# Broadcasts from a 128-bit source; the name suffix gives the result type.
def __broadcastb_16i8{a:T & match{T,[16]i8}} = emit{T, '_mm_broadcastb_epi8', a}
def __broadcastb_32i8{a:T & match{T,[16]i8}} = emit{[32]i8, '_mm256_broadcastb_epi8', a}
def __broadcastd_4i32{a:T & match{T,[4]i32}} = emit{T, '_mm_broadcastd_epi32', a}
def __broadcastd_8i32{a:T & match{T,[4]i32}} = emit{[8]i32, '_mm256_broadcastd_epi32', a}
def __broadcastq_2i64{a:T & match{T,[2]i64}} = emit{T, '_mm_broadcastq_epi64', a}
def __broadcastq_4i64{a:T & match{T,[2]i64}} = emit{[4]i64, '_mm256_broadcastq_epi64', a}
def __broadcastsd_2f64{a:T & match{T,[2]f64}} = emit{T, '_mm_broadcastsd_pd', a}
def __broadcastsd_4f64{a:T & match{T,[2]f64}} = emit{[4]f64, '_mm256_broadcastsd_pd', a}
def __broadcastss_4f32{a:T & match{T,[4]f32}} = emit{T, '_mm_broadcastss_ps', a}
def __broadcastss_8f32{a:T & match{T,[4]f32}} = emit{[8]f32, '_mm256_broadcastss_ps', a}
def __broadcastw_8i16{a:T & match{T,[8]i16}} = emit{T, '_mm_broadcastw_epi16', a}
def __broadcastw_16i16{a:T & match{T,[8]i16}} = emit{[16]i16, '_mm256_broadcastw_epi16', a}

# Lane insertion, permutes, shuffles and unpacks.
def __inserti128{a:T, b:S, imm8 & intvec{256,T} & intvec{128,S} & num{imm8}} = emit{T, '_mm256_inserti128_si256', a, b, imm8}
def __permute2x128{a:T, b:T, imm8 & intvec{256,T} & num{imm8}} = emit{T, '_mm256_permute2x128_si256', a, b, imm8}
def __permute4x64{a:T, imm8 & match{T,[4]i64} & num{imm8}} = emit{T, '_mm256_permute4x64_epi64', a, imm8}
def __permute4x64{a:T, imm8 & match{T,[4]f64} & num{imm8}} = emit{T, '_mm256_permute4x64_pd', a, imm8}
def __permutevar8x32{a:T, idx:T & match{T,[8]i32}} = emit{T, '_mm256_permutevar8x32_epi32', a, idx}
def __permutevar8x32{a:T, idx:S & match{T,[8]f32} & match{S,[8]u32}} = emit{T, '_mm256_permutevar8x32_ps', a, idx}
def __shuffle{a:T, imm8 & match{T,[8]i32} & num{imm8}} = emit{T, '_mm256_shuffle_epi32', a, imm8}
def __shuffle{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_shuffle_epi8', a, b}
def __shufflehi{a:T, imm8 & match{T,[16]i16} & num{imm8}} = emit{T, '_mm256_shufflehi_epi16', a, imm8}
def __shufflelo{a:T, imm8 & match{T,[16]i16} & num{imm8}} = emit{T, '_mm256_shufflelo_epi16', a, imm8}
def __unpackhi{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_unpackhi_epi8', a, b}
def __unpackhi{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_unpackhi_epi16', a, b}
def __unpackhi{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_unpackhi_epi32', a, b}
def __unpackhi{a:T, b:T & match{T,[4]i64}} = emit{T, '_mm256_unpackhi_epi64', a, b}
def __unpacklo{a:T, b:T & match{T,[32]i8}} = emit{T, '_mm256_unpacklo_epi8', a, b}
def __unpacklo{a:T, b:T & match{T,[16]i16}} = emit{T, '_mm256_unpacklo_epi16', a, b}
def __unpacklo{a:T, b:T & match{T,[8]i32}} = emit{T, '_mm256_unpacklo_epi32', a, b}
def __unpacklo{a:T, b:T & match{T,[4]i64}} = emit{T, '_mm256_unpacklo_epi64', a, b}

# FMA

# Fused multiply-add family on 128-bit and 256-bit float vectors.
def __fmadd{a:T, b:T, c:T & match{T,[2]f64}} = emit{T, '_mm_fmadd_pd', a, b, c}
def __fmadd{a:T, b:T, c:T & match{T,[4]f64}} = emit{T, '_mm256_fmadd_pd', a, b, c}
def __fmadd{a:T, b:T, c:T & match{T,[4]f32}} = emit{T, '_mm_fmadd_ps', a, b, c}
def __fmadd{a:T, b:T, c:T & match{T,[8]f32}} = emit{T, '_mm256_fmadd_ps', a, b, c}
def __fmaddsub{a:T, b:T, c:T & match{T,[2]f64}} = emit{T, '_mm_fmaddsub_pd', a, b, c}
def __fmaddsub{a:T, b:T, c:T & match{T,[4]f64}} = emit{T, '_mm256_fmaddsub_pd', a, b, c}
def __fmaddsub{a:T, b:T, c:T & match{T,[4]f32}} = emit{T, '_mm_fmaddsub_ps', a, b, c}
def __fmaddsub{a:T, b:T, c:T & match{T,[8]f32}} = emit{T, '_mm256_fmaddsub_ps', a, b, c}
def __fmsub{a:T, b:T, c:T & match{T,[2]f64}} = emit{T, '_mm_fmsub_pd', a, b, c}
def __fmsub{a:T, b:T, c:T & match{T,[4]f64}} = emit{T, '_mm256_fmsub_pd', a, b, c}
def __fmsub{a:T, b:T, c:T & match{T,[4]f32}} = emit{T, '_mm_fmsub_ps', a, b, c}
def __fmsub{a:T, b:T, c:T & match{T,[8]f32}} = emit{T, '_mm256_fmsub_ps', a, b, c}
def __fmsubadd{a:T, b:T, c:T & match{T,[2]f64}} = emit{T, '_mm_fmsubadd_pd', a, b, c}
def __fmsubadd{a:T, b:T, c:T & match{T,[4]f64}} = emit{T, '_mm256_fmsubadd_pd', a, b, c}
def __fmsubadd{a:T, b:T, c:T & match{T,[4]f32}} = emit{T, '_mm_fmsubadd_ps', a, b, c}
def __fmsubadd{a:T, b:T, c:T & match{T,[8]f32}} = emit{T, '_mm256_fmsubadd_ps', a, b, 
c} def __fnmadd{a:T, b:T, c:T & match{T,[2]f64}} = emit{T, '_mm_fnmadd_pd', a, b, c} def __fnmadd{a:T, b:T, c:T & match{T,[4]f64}} = emit{T, '_mm256_fnmadd_pd', a, b, c} def __fnmadd{a:T, b:T, c:T & match{T,[4]f32}} = emit{T, '_mm_fnmadd_ps', a, b, c} def __fnmadd{a:T, b:T, c:T & match{T,[8]f32}} = emit{T, '_mm256_fnmadd_ps', a, b, c} def __fnmsub{a:T, b:T, c:T & match{T,[2]f64}} = emit{T, '_mm_fnmsub_pd', a, b, c} def __fnmsub{a:T, b:T, c:T & match{T,[4]f64}} = emit{T, '_mm256_fnmsub_pd', a, b, c} def __fnmsub{a:T, b:T, c:T & match{T,[4]f32}} = emit{T, '_mm_fnmsub_ps', a, b, c} def __fnmsub{a:T, b:T, c:T & match{T,[8]f32}} = emit{T, '_mm256_fnmsub_ps', a, b, c}