/*---------------------------------------------------------------*/ /*--- begin host_amd64_isel.c ---*/ /*---------------------------------------------------------------*/ /* This file is part of Valgrind, a dynamic binary instrumentation framework. Copyright (C) 2004-2017 OpenWorks LLP info@open-works.net This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, see . The GNU General Public License is contained in the file COPYING. Neither the names of the U.S. Department of Energy nor the University of California nor the names of its contributors may be used to endorse or promote products derived from this software without prior written permission. */ #include "libvex_basictypes.h" #include "libvex_ir.h" #include "libvex.h" #include "ir_match.h" #include "main_util.h" #include "main_globals.h" #include "host_generic_regs.h" #include "host_generic_simd64.h" #include "host_generic_simd128.h" #include "host_generic_simd256.h" #include "host_generic_maddf.h" #include "host_amd64_defs.h" /*---------------------------------------------------------*/ /*--- x87/SSE control word stuff ---*/ /*---------------------------------------------------------*/ /* Vex-generated code expects to run with the FPU set as follows: all exceptions masked, round-to-nearest, precision = 53 bits. This corresponds to a FPU control word value of 0x027F. Similarly the SSE control word (%mxcsr) should be 0x1F80. %fpucw and %mxcsr should have these values on entry to Vex-generated code, and should those values should be unchanged at exit. */ #define DEFAULT_FPUCW 0x027F #define DEFAULT_MXCSR 0x1F80 /* debugging only, do not use */ /* define DEFAULT_FPUCW 0x037F */ /*---------------------------------------------------------*/ /*--- misc helpers ---*/ /*---------------------------------------------------------*/ /* These are duplicated in guest-amd64/toIR.c */ static IRExpr* unop ( IROp op, IRExpr* a ) { return IRExpr_Unop(op, a); } static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 ) { return IRExpr_Binop(op, a1, a2); } static IRExpr* bind ( Int binder ) { return IRExpr_Binder(binder); } static Bool isZeroU8 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U8 && e->Iex.Const.con->Ico.U8 == 0; } /*---------------------------------------------------------*/ /*--- ISelEnv ---*/ /*---------------------------------------------------------*/ /* This carries around: - A mapping from IRTemp to IRType, giving the type of any IRTemp we might encounter. This is computed before insn selection starts, and does not change. - A mapping from IRTemp to HReg. This tells the insn selector which virtual register is associated with each IRTemp temporary. This is computed before insn selection starts, and does not change. We expect this mapping to map precisely the same set of IRTemps as the type mapping does. - vregmap holds the primary register for the IRTemp. - vregmapHI is only used for 128-bit integer-typed IRTemps. It holds the identity of a second 64-bit virtual HReg, which holds the high half of the value. - The host subarchitecture we are selecting insns for. This is set at the start and does not change. - The code array, that is, the insns selected so far. - A counter, for generating new virtual registers. - A Bool for indicating whether we may generate chain-me instructions for control flow transfers, or whether we must use XAssisted. - The maximum guest address of any guest insn in this block. Actually, the address of the highest-addressed byte from any insn in this block. Is set at the start and does not change. This is used for detecting jumps which are definitely forward-edges from this block, and therefore can be made (chained) to the fast entry point of the destination, thereby avoiding the destination's event check. Note, this is all host-independent. (JRS 20050201: well, kinda ... not completely. Compare with ISelEnv for X86.) */ typedef struct { /* Constant -- are set at the start and do not change. */ IRTypeEnv* type_env; HReg* vregmap; HReg* vregmapHI; Int n_vregmap; UInt hwcaps; Bool chainingAllowed; Addr64 max_ga; /* These are modified as we go along. */ HInstrArray* code; Int vreg_ctr; } ISelEnv; static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp ) { vassert(tmp >= 0); vassert(tmp < env->n_vregmap); return env->vregmap[tmp]; } static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp ) { vassert(tmp >= 0); vassert(tmp < env->n_vregmap); vassert(! hregIsInvalid(env->vregmapHI[tmp])); *vrLO = env->vregmap[tmp]; *vrHI = env->vregmapHI[tmp]; } static void addInstr ( ISelEnv* env, AMD64Instr* instr ) { addHInstr(env->code, instr); if (vex_traceflags & VEX_TRACE_VCODE) { ppAMD64Instr(instr, True); vex_printf("\n"); } } static HReg newVRegI ( ISelEnv* env ) { HReg reg = mkHReg(True/*virtual reg*/, HRcInt64, 0/*enc*/, env->vreg_ctr); env->vreg_ctr++; return reg; } static HReg newVRegV ( ISelEnv* env ) { HReg reg = mkHReg(True/*virtual reg*/, HRcVec128, 0/*enc*/, env->vreg_ctr); env->vreg_ctr++; return reg; } /*---------------------------------------------------------*/ /*--- ISEL: Forward declarations ---*/ /*---------------------------------------------------------*/ /* These are organised as iselXXX and iselXXX_wrk pairs. The iselXXX_wrk do the real work, but are not to be called directly. For each XXX, iselXXX calls its iselXXX_wrk counterpart, then checks that all returned registers are virtual. You should not call the _wrk version directly. */ static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e ); static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e ); static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e ); static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e ); static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e ); static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e ); static HReg iselIntExpr_R_wrk ( ISelEnv* env, const IRExpr* e ); static HReg iselIntExpr_R ( ISelEnv* env, const IRExpr* e ); static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e ); static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e ); static void iselInt128Expr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, ISelEnv* env, const IRExpr* e ); static void iselInt128Expr ( /*OUT*/HReg* rHi, HReg* rLo, ISelEnv* env, const IRExpr* e ); static AMD64CondCode iselCondCode_C_wrk ( ISelEnv* env, const IRExpr* e ); static AMD64CondCode iselCondCode_C ( ISelEnv* env, const IRExpr* e ); static HReg iselCondCode_R_wrk ( ISelEnv* env, const IRExpr* e ); static HReg iselCondCode_R ( ISelEnv* env, const IRExpr* e ); static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e ); static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e ); static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e ); static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e ); static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ); static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e ); static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, HReg* rLo, ISelEnv* env, const IRExpr* e ); static void iselDVecExpr ( /*OUT*/HReg* rHi, HReg* rLo, ISelEnv* env, const IRExpr* e ); /*---------------------------------------------------------*/ /*--- ISEL: Misc helpers ---*/ /*---------------------------------------------------------*/ static Bool sane_AMode ( AMD64AMode* am ) { switch (am->tag) { case Aam_IR: return toBool( hregClass(am->Aam.IR.reg) == HRcInt64 && (hregIsVirtual(am->Aam.IR.reg) || sameHReg(am->Aam.IR.reg, hregAMD64_RBP())) ); case Aam_IRRS: return toBool( hregClass(am->Aam.IRRS.base) == HRcInt64 && hregIsVirtual(am->Aam.IRRS.base) && hregClass(am->Aam.IRRS.index) == HRcInt64 && hregIsVirtual(am->Aam.IRRS.index) ); default: vpanic("sane_AMode: unknown amd64 amode tag"); } } /* Can the lower 32 bits be signedly widened to produce the whole 64-bit value? In other words, are the top 33 bits either all 0 or all 1 ? */ static Bool fitsIn32Bits ( ULong x ) { Long y1; y1 = x << 32; y1 >>=/*s*/ 32; return toBool(x == y1); } /* Is this a 64-bit zero expression? */ static Bool isZeroU64 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U64 && e->Iex.Const.con->Ico.U64 == 0ULL; } static Bool isZeroU32 ( const IRExpr* e ) { return e->tag == Iex_Const && e->Iex.Const.con->tag == Ico_U32 && e->Iex.Const.con->Ico.U32 == 0; } /* Are both args atoms and the same? This is copy of eqIRAtom that omits the assertions that the args are indeed atoms. */ static Bool areAtomsAndEqual ( const IRExpr* a1, const IRExpr* a2 ) { if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp) return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp); if (a1->tag == Iex_Const && a2->tag == Iex_Const) return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con); return False; } /* Make a int reg-reg move. */ static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst ) { vassert(hregClass(src) == HRcInt64); vassert(hregClass(dst) == HRcInt64); return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst); } /* Make a vector (128 bit) reg-reg move. */ static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst ) { vassert(hregClass(src) == HRcVec128); vassert(hregClass(dst) == HRcVec128); return AMD64Instr_SseReRg(Asse_MOV, src, dst); } /* Advance/retreat %rsp by n. */ static void add_to_rsp ( ISelEnv* env, Int n ) { vassert(n > 0 && n < 256 && (n%8) == 0); addInstr(env, AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n), hregAMD64_RSP())); } static void sub_from_rsp ( ISelEnv* env, Int n ) { vassert(n > 0 && n < 256 && (n%8) == 0); addInstr(env, AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n), hregAMD64_RSP())); } /* Push 64-bit constants on the stack. */ static void push_uimm64( ISelEnv* env, ULong uimm64 ) { /* If uimm64 can be expressed as the sign extension of its lower 32 bits, we can do it the easy way. */ Long simm64 = (Long)uimm64; if ( simm64 == ((Long)(uimm64 << 32) >> 32) ) { addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) ); } else { HReg tmp = newVRegI(env); addInstr( env, AMD64Instr_Imm64(uimm64, tmp) ); addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) ); } } /* Used only in doHelperCall. If possible, produce a single instruction which computes 'e' into 'dst'. If not possible, return NULL. */ static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env, HReg dst, IRExpr* e ) { /* Per comments in doHelperCall below, appearance of Iex_VECRET implies ill-formed IR. */ vassert(e->tag != Iex_VECRET); /* In this case we give out a copy of the BaseBlock pointer. */ if (UNLIKELY(e->tag == Iex_GSPTR)) { return mk_iMOVsd_RR( hregAMD64_RBP(), dst ); } vassert(typeOfIRExpr(env->type_env, e) == Ity_I64); if (e->tag == Iex_Const) { vassert(e->Iex.Const.con->tag == Ico_U64); if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) { return AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)), dst ); } else { return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst); } } if (e->tag == Iex_RdTmp) { HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp); return mk_iMOVsd_RR(src, dst); } if (e->tag == Iex_Get) { vassert(e->Iex.Get.ty == Ity_I64); return AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem( AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())), dst); } if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_RdTmp) { HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp); return AMD64Instr_MovxLQ(False, src, dst); } if (0) { ppIRExpr(e); vex_printf("\n"); } return NULL; } /* Do a complete function call. |guard| is a Ity_Bit expression indicating whether or not the call happens. If guard==NULL, the call is unconditional. |retloc| is set to indicate where the return value is after the call. The caller (of this fn) must generate code to add |stackAdjustAfterCall| to the stack pointer after the call is done. */ static void doHelperCall ( /*OUT*/UInt* stackAdjustAfterCall, /*OUT*/RetLoc* retloc, ISelEnv* env, IRExpr* guard, IRCallee* cee, IRType retTy, IRExpr** args ) { AMD64CondCode cc; HReg argregs[6]; HReg tmpregs[6]; AMD64Instr* fastinstrs[6]; UInt n_args, i; /* Set default returns. We'll update them later if needed. */ *stackAdjustAfterCall = 0; *retloc = mk_RetLoc_INVALID(); /* These are used for cross-checking that IR-level constraints on the use of IRExpr_VECRET() and IRExpr_GSPTR() are observed. */ UInt nVECRETs = 0; UInt nGSPTRs = 0; /* Marshal args for a call and do the call. This function only deals with a tiny set of possibilities, which cover all helpers in practice. The restrictions are that only arguments in registers are supported, hence only 6x64 integer bits in total can be passed. In fact the only supported arg type is I64. The return type can be I{64,32,16,8} or V{128,256}. In the latter two cases, it is expected that |args| will contain the special node IRExpr_VECRET(), in which case this routine generates code to allocate space on the stack for the vector return value. Since we are not passing any scalars on the stack, it is enough to preallocate the return space before marshalling any arguments, in this case. |args| may also contain IRExpr_GSPTR(), in which case the value in %rbp is passed as the corresponding argument. Generating code which is both efficient and correct when parameters are to be passed in registers is difficult, for the reasons elaborated in detail in comments attached to doHelperCall() in priv/host-x86/isel.c. Here, we use a variant of the method described in those comments. The problem is split into two cases: the fast scheme and the slow scheme. In the fast scheme, arguments are computed directly into the target (real) registers. This is only safe when we can be sure that computation of each argument will not trash any real registers set by computation of any other argument. In the slow scheme, all args are first computed into vregs, and once they are all done, they are moved to the relevant real regs. This always gives correct code, but it also gives a bunch of vreg-to-rreg moves which are usually redundant but are hard for the register allocator to get rid of. To decide which scheme to use, all argument expressions are first examined. If they are all so simple that it is clear they will be evaluated without use of any fixed registers, use the fast scheme, else use the slow scheme. Note also that only unconditional calls may use the fast scheme, since having to compute a condition expression could itself trash real registers. Note that for simplicity, in the case where IRExpr_VECRET() is present, we use the slow scheme. This is motivated by the desire to avoid any possible complexity w.r.t. nested calls. Note this requires being able to examine an expression and determine whether or not evaluation of it might use a fixed register. That requires knowledge of how the rest of this insn selector works. Currently just the following 3 are regarded as safe -- hopefully they cover the majority of arguments in practice: IRExpr_Tmp IRExpr_Const IRExpr_Get. */ /* Note that the cee->regparms field is meaningless on AMD64 host (since there is only one calling convention) and so we always ignore it. */ n_args = 0; for (i = 0; args[i]; i++) n_args++; if (n_args > 6) vpanic("doHelperCall(AMD64): cannot currently handle > 6 args"); argregs[0] = hregAMD64_RDI(); argregs[1] = hregAMD64_RSI(); argregs[2] = hregAMD64_RDX(); argregs[3] = hregAMD64_RCX(); argregs[4] = hregAMD64_R8(); argregs[5] = hregAMD64_R9(); tmpregs[0] = tmpregs[1] = tmpregs[2] = tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG; fastinstrs[0] = fastinstrs[1] = fastinstrs[2] = fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL; /* First decide which scheme (slow or fast) is to be used. First assume the fast scheme, and select slow if any contraindications (wow) appear. */ /* We'll need space on the stack for the return value. Avoid possible complications with nested calls by using the slow scheme. */ if (retTy == Ity_V128 || retTy == Ity_V256) goto slowscheme; if (guard) { if (guard->tag == Iex_Const && guard->Iex.Const.con->tag == Ico_U1 && guard->Iex.Const.con->Ico.U1 == True) { /* unconditional */ } else { /* Not manifestly unconditional -- be conservative. */ goto slowscheme; } } /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll use the slow scheme. Because this is tentative, we can't call addInstr (that is, commit to) any instructions until we're handled all the arguments. So park the resulting instructions in a buffer and emit that if we're successful. */ /* FAST SCHEME */ /* In this loop, we process args that can be computed into the destination (real) register with a single instruction, without using any fixed regs. That also includes IRExpr_GSPTR(), but not IRExpr_VECRET(). Indeed, if the IR is well-formed, we can never see IRExpr_VECRET() at this point, since the return-type check above should ensure all those cases use the slow scheme instead. */ vassert(n_args >= 0 && n_args <= 6); for (i = 0; i < n_args; i++) { IRExpr* arg = args[i]; if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(arg))) { vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); } fastinstrs[i] = iselIntExpr_single_instruction( env, argregs[i], args[i] ); if (fastinstrs[i] == NULL) goto slowscheme; } /* Looks like we're in luck. Emit the accumulated instructions and move on to doing the call itself. */ for (i = 0; i < n_args; i++) addInstr(env, fastinstrs[i]); /* Fast scheme only applies for unconditional calls. Hence: */ cc = Acc_ALWAYS; goto handle_call; /* SLOW SCHEME; move via temporaries */ slowscheme: {} # if 0 /* debug only */ if (n_args > 0) {for (i = 0; args[i]; i++) { ppIRExpr(args[i]); vex_printf(" "); } vex_printf("\n");} # endif /* If we have a vector return type, allocate a place for it on the stack and record its address. */ HReg r_vecRetAddr = INVALID_HREG; if (retTy == Ity_V128) { r_vecRetAddr = newVRegI(env); sub_from_rsp(env, 16); addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr )); } else if (retTy == Ity_V256) { r_vecRetAddr = newVRegI(env); sub_from_rsp(env, 32); addInstr(env, mk_iMOVsd_RR( hregAMD64_RSP(), r_vecRetAddr )); } vassert(n_args >= 0 && n_args <= 6); for (i = 0; i < n_args; i++) { IRExpr* arg = args[i]; if (UNLIKELY(arg->tag == Iex_GSPTR)) { tmpregs[i] = newVRegI(env); addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[i])); nGSPTRs++; } else if (UNLIKELY(arg->tag == Iex_VECRET)) { /* We stashed the address of the return slot earlier, so just retrieve it now. */ vassert(!hregIsInvalid(r_vecRetAddr)); tmpregs[i] = r_vecRetAddr; nVECRETs++; } else { vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64); tmpregs[i] = iselIntExpr_R(env, args[i]); } } /* Now we can compute the condition. We can't do it earlier because the argument computations could trash the condition codes. Be a bit clever to handle the common case where the guard is 1:Bit. */ cc = Acc_ALWAYS; if (guard) { if (guard->tag == Iex_Const && guard->Iex.Const.con->tag == Ico_U1 && guard->Iex.Const.con->Ico.U1 == True) { /* unconditional -- do nothing */ } else { cc = iselCondCode_C( env, guard ); } } /* Move the args to their final destinations. */ for (i = 0; i < n_args; i++) { /* None of these insns, including any spill code that might be generated, may alter the condition codes. */ addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) ); } /* Do final checks, set the return values, and generate the call instruction proper. */ handle_call: if (retTy == Ity_V128 || retTy == Ity_V256) { vassert(nVECRETs == 1); } else { vassert(nVECRETs == 0); } vassert(nGSPTRs == 0 || nGSPTRs == 1); vassert(*stackAdjustAfterCall == 0); vassert(is_RetLoc_INVALID(*retloc)); switch (retTy) { case Ity_INVALID: /* Function doesn't return a value. */ *retloc = mk_RetLoc_simple(RLPri_None); break; case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: *retloc = mk_RetLoc_simple(RLPri_Int); break; case Ity_V128: *retloc = mk_RetLoc_spRel(RLPri_V128SpRel, 0); *stackAdjustAfterCall = 16; break; case Ity_V256: *retloc = mk_RetLoc_spRel(RLPri_V256SpRel, 0); *stackAdjustAfterCall = 32; break; default: /* IR can denote other possible return types, but we don't handle those here. */ vassert(0); } /* Finally, generate the call itself. This needs the *retloc value set in the switch above, which is why it's at the end. */ addInstr(env, AMD64Instr_Call(cc, (Addr)cee->addr, n_args, *retloc)); } /* Given a guest-state array descriptor, an index expression and a bias, generate an AMD64AMode holding the relevant guest state offset. */ static AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr, IRExpr* off, Int bias ) { HReg tmp, roff; Int elemSz = sizeofIRType(descr->elemTy); Int nElems = descr->nElems; /* Throw out any cases not generated by an amd64 front end. In theory there might be a day where we need to handle them -- if we ever run non-amd64-guest on amd64 host. */ if (nElems != 8 || (elemSz != 1 && elemSz != 8)) vpanic("genGuestArrayOffset(amd64 host)"); /* Compute off into a reg, %off. Then return: movq %off, %tmp addq $bias, %tmp (if bias != 0) andq %tmp, 7 ... base(%rbp, %tmp, shift) ... */ tmp = newVRegI(env); roff = iselIntExpr_R(env, off); addInstr(env, mk_iMOVsd_RR(roff, tmp)); if (bias != 0) { /* Make sure the bias is sane, in the sense that there are no significant bits above bit 30 in it. */ vassert(-10000 < bias && bias < 10000); addInstr(env, AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp)); } addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp)); vassert(elemSz == 1 || elemSz == 8); return AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp, elemSz==8 ? 3 : 0); } /* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */ static void set_SSE_rounding_default ( ISelEnv* env ) { /* pushq $DEFAULT_MXCSR ldmxcsr 0(%rsp) addq $8, %rsp */ AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP()); addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR))); addInstr(env, AMD64Instr_LdMXCSR(zero_rsp)); add_to_rsp(env, 8); } /* Mess with the FPU's rounding mode: set to the default rounding mode (DEFAULT_FPUCW). */ static void set_FPU_rounding_default ( ISelEnv* env ) { /* movq $DEFAULT_FPUCW, -8(%rsp) fldcw -8(%esp) */ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp)); addInstr(env, AMD64Instr_A87LdCW(m8_rsp)); } /* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed expression denoting a value in the range 0 .. 3, indicating a round mode encoded as per type IRRoundingMode. Set the SSE machinery to have the same rounding. */ static void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode ) { /* Note: this sequence only makes sense because DEFAULT_MXCSR has both rounding bits == 0. If that wasn't the case, we couldn't create a new rounding field simply by ORing the new value into place. */ /* movq $3, %reg andq [[mode]], %reg -- shouldn't be needed; paranoia shlq $13, %reg orq $DEFAULT_MXCSR, %reg pushq %reg ldmxcsr 0(%esp) addq $8, %rsp */ HReg reg = newVRegI(env); AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP()); addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg)); addInstr(env, AMD64Instr_Alu64R(Aalu_AND, iselIntExpr_RMI(env, mode), reg)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg)); addInstr(env, AMD64Instr_Alu64R( Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg)); addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg))); addInstr(env, AMD64Instr_LdMXCSR(zero_rsp)); add_to_rsp(env, 8); } /* Mess with the FPU's rounding mode: 'mode' is an I32-typed expression denoting a value in the range 0 .. 3, indicating a round mode encoded as per type IRRoundingMode. Set the x87 FPU to have the same rounding. */ static void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode ) { HReg rrm = iselIntExpr_R(env, mode); HReg rrm2 = newVRegI(env); AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); /* movq %rrm, %rrm2 andq $3, %rrm2 -- shouldn't be needed; paranoia shlq $10, %rrm2 orq $DEFAULT_FPUCW, %rrm2 movq %rrm2, -8(%rsp) fldcw -8(%esp) */ addInstr(env, mk_iMOVsd_RR(rrm, rrm2)); addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2)); addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Imm(DEFAULT_FPUCW), rrm2)); addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(rrm2), m8_rsp)); addInstr(env, AMD64Instr_A87LdCW(m8_rsp)); } /* Generate all-zeroes into a new vector register. */ static HReg generate_zeroes_V128 ( ISelEnv* env ) { HReg dst = newVRegV(env); addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst)); return dst; } /* Generate all-ones into a new vector register. */ static HReg generate_ones_V128 ( ISelEnv* env ) { HReg dst = newVRegV(env); addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst)); return dst; } /* Generate !src into a new vector register. Amazing that there isn't a less crappy way to do this. */ static HReg do_sse_NotV128 ( ISelEnv* env, HReg src ) { HReg dst = generate_ones_V128(env); addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst)); return dst; } /* Expand the given byte into a 64-bit word, by cloning each bit 8 times. */ static ULong bitmask8_to_bytemask64 ( UShort w8 ) { vassert(w8 == (w8 & 0xFF)); ULong w64 = 0; Int i; for (i = 0; i < 8; i++) { if (w8 & (1<type_env,e); switch (ty) { case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: break; default: vassert(0); } switch (e->tag) { /* --------- TEMP --------- */ case Iex_RdTmp: { return lookupIRTemp(env, e->Iex.RdTmp.tmp); } /* --------- LOAD --------- */ case Iex_Load: { HReg dst = newVRegI(env); AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr ); /* We can't handle big-endian loads, nor load-linked. */ if (e->Iex.Load.end != Iend_LE) goto irreducible; if (ty == Ity_I64) { addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Mem(amode), dst) ); return dst; } if (ty == Ity_I32) { addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst)); return dst; } if (ty == Ity_I16) { addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst)); return dst; } if (ty == Ity_I8) { addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst)); return dst; } break; } /* --------- BINARY OP --------- */ case Iex_Binop: { AMD64AluOp aluOp; AMD64ShiftOp shOp; /* Pattern: Sub64(0,x) */ /* and: Sub32(0,x) */ if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1)) || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) { HReg dst = newVRegI(env); HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2); addInstr(env, mk_iMOVsd_RR(reg,dst)); addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst)); return dst; } /* Is it an addition or logical style op? */ switch (e->Iex.Binop.op) { case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64: aluOp = Aalu_ADD; break; case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64: aluOp = Aalu_SUB; break; case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64: aluOp = Aalu_AND; break; case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64: aluOp = Aalu_OR; break; case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64: aluOp = Aalu_XOR; break; case Iop_Mul16: case Iop_Mul32: case Iop_Mul64: aluOp = Aalu_MUL; break; default: aluOp = Aalu_INVALID; break; } /* For commutative ops we assume any literal values are on the second operand. */ if (aluOp != Aalu_INVALID) { HReg dst = newVRegI(env); HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1); AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2); addInstr(env, mk_iMOVsd_RR(reg,dst)); addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst)); return dst; } /* Perhaps a shift op? */ switch (e->Iex.Binop.op) { case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8: shOp = Ash_SHL; break; case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8: shOp = Ash_SHR; break; case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8: shOp = Ash_SAR; break; default: shOp = Ash_INVALID; break; } if (shOp != Ash_INVALID) { HReg dst = newVRegI(env); /* regL = the value to be shifted */ HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1); addInstr(env, mk_iMOVsd_RR(regL,dst)); /* Do any necessary widening for 16/8 bit operands. Also decide on the final width at which the shift is to be done. */ Bool shift64 = False; switch (e->Iex.Binop.op) { case Iop_Shr64: case Iop_Shl64: case Iop_Sar64: shift64 = True; break; case Iop_Shl32: case Iop_Shl16: case Iop_Shl8: break; case Iop_Shr8: addInstr(env, AMD64Instr_Alu64R( Aalu_AND, AMD64RMI_Imm(0xFF), dst)); break; case Iop_Shr16: addInstr(env, AMD64Instr_Alu64R( Aalu_AND, AMD64RMI_Imm(0xFFFF), dst)); break; case Iop_Shr32: break; case Iop_Sar8: addInstr(env, AMD64Instr_Sh32(Ash_SHL, 24, dst)); addInstr(env, AMD64Instr_Sh32(Ash_SAR, 24, dst)); break; case Iop_Sar16: addInstr(env, AMD64Instr_Sh32(Ash_SHL, 16, dst)); addInstr(env, AMD64Instr_Sh32(Ash_SAR, 16, dst)); break; case Iop_Sar32: break; default: ppIROp(e->Iex.Binop.op); vassert(0); } /* Now consider the shift amount. If it's a literal, we can do a much better job than the general case. */ if (e->Iex.Binop.arg2->tag == Iex_Const) { /* assert that the IR is well-typed */ Int nshift; vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8); nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8; vassert(nshift >= 0); if (nshift > 0) { /* Can't allow nshift==0 since that means %cl */ if (shift64) { addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst)); } else { addInstr(env, AMD64Instr_Sh32(shOp, nshift, dst)); } } } else { /* General case; we have to force the amount into %cl. */ HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2); addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX())); if (shift64) { addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst)); } else { addInstr(env, AMD64Instr_Sh32(shOp, 0/* %cl */, dst)); } } return dst; } /* Handle misc other scalar ops. */ if (e->Iex.Binop.op == Iop_Max32U) { HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1); HReg dst = newVRegI(env); HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2); addInstr(env, mk_iMOVsd_RR(src1, dst)); addInstr(env, AMD64Instr_Alu32R(Aalu_CMP, AMD64RMI_Reg(src2), dst)); addInstr(env, AMD64Instr_CMov64(Acc_B, src2, dst)); return dst; } if (e->Iex.Binop.op == Iop_DivModS64to32 || e->Iex.Binop.op == Iop_DivModU64to32) { /* 64 x 32 -> (32(rem),32(div)) division */ /* Get the 64-bit operand into edx:eax, and the other into any old R/M. */ HReg rax = hregAMD64_RAX(); HReg rdx = hregAMD64_RDX(); HReg dst = newVRegI(env); Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32); AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2); /* Compute the left operand into a reg, and then put the top half in edx and the bottom in eax. */ HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1); addInstr(env, mk_iMOVsd_RR(left64, rdx)); addInstr(env, mk_iMOVsd_RR(left64, rax)); addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx)); addInstr(env, AMD64Instr_Div(syned, 4, rmRight)); addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx)); addInstr(env, AMD64Instr_MovxLQ(False, rax, rax)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx)); addInstr(env, mk_iMOVsd_RR(rax, dst)); addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst)); return dst; } if (e->Iex.Binop.op == Iop_32HLto64) { HReg hi32 = newVRegI(env); HReg lo32 = newVRegI(env); HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1); HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2); addInstr(env, mk_iMOVsd_RR(hi32s, hi32)); addInstr(env, mk_iMOVsd_RR(lo32s, lo32)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32)); addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32)); addInstr(env, AMD64Instr_Alu64R( Aalu_OR, AMD64RMI_Reg(lo32), hi32)); return hi32; } if (e->Iex.Binop.op == Iop_16HLto32) { HReg hi16 = newVRegI(env); HReg lo16 = newVRegI(env); HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1); HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2); addInstr(env, mk_iMOVsd_RR(hi16s, hi16)); addInstr(env, mk_iMOVsd_RR(lo16s, lo16)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16)); addInstr(env, AMD64Instr_Alu64R( Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16)); addInstr(env, AMD64Instr_Alu64R( Aalu_OR, AMD64RMI_Reg(lo16), hi16)); return hi16; } if (e->Iex.Binop.op == Iop_8HLto16) { HReg hi8 = newVRegI(env); HReg lo8 = newVRegI(env); HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1); HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2); addInstr(env, mk_iMOVsd_RR(hi8s, hi8)); addInstr(env, mk_iMOVsd_RR(lo8s, lo8)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8)); addInstr(env, AMD64Instr_Alu64R( Aalu_AND, AMD64RMI_Imm(0xFF), lo8)); addInstr(env, AMD64Instr_Alu64R( Aalu_OR, AMD64RMI_Reg(lo8), hi8)); return hi8; } if (e->Iex.Binop.op == Iop_MullS32 || e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8 || e->Iex.Binop.op == Iop_MullU32 || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) { HReg a32 = newVRegI(env); HReg b32 = newVRegI(env); HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1); HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2); Int shift = 0; AMD64ShiftOp shr_op = Ash_SHR; switch (e->Iex.Binop.op) { case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break; case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break; case Iop_MullS8: shr_op = Ash_SAR; shift = 56; break; case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break; case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break; case Iop_MullU8: shr_op = Ash_SHR; shift = 56; break; default: vassert(0); } addInstr(env, mk_iMOVsd_RR(a32s, a32)); addInstr(env, mk_iMOVsd_RR(b32s, b32)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32)); addInstr(env, AMD64Instr_Sh64(shr_op, shift, a32)); addInstr(env, AMD64Instr_Sh64(shr_op, shift, b32)); addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32)); return b32; } if (e->Iex.Binop.op == Iop_CmpF64) { HReg fL = iselDblExpr(env, e->Iex.Binop.arg1); HReg fR = iselDblExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegI(env); addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst)); /* Mask out irrelevant parts of the result so as to conform to the CmpF64 definition. */ addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst)); return dst; } if (e->Iex.Binop.op == Iop_F64toI32S || e->Iex.Binop.op == Iop_F64toI64S) { Int szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8; HReg rf = iselDblExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegI(env); set_SSE_rounding_mode( env, e->Iex.Binop.arg1 ); addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst )); set_SSE_rounding_default(env); return dst; } /* Deal with 64-bit SIMD binary ops. For the most part these are doable by using the equivalent 128-bit operation and ignoring the upper half of the result. */ AMD64SseOp op = Asse_INVALID; Bool arg1isEReg = False; Bool preShift32R = False; switch (e->Iex.Binop.op) { // The following 3 could be done with 128 bit insns too, but // first require the inputs to be reformatted. //case Iop_QNarrowBin32Sto16Sx4: //op = Asse_PACKSSD; arg1isEReg = True; break; //case Iop_QNarrowBin16Sto8Sx8: //op = Asse_PACKSSW; arg1isEReg = True; break; //case Iop_QNarrowBin16Sto8Ux8: //op = Asse_PACKUSW; arg1isEReg = True; break; case Iop_InterleaveHI8x8: op = Asse_UNPCKLB; arg1isEReg = True; preShift32R = True; break; case Iop_InterleaveHI16x4: op = Asse_UNPCKLW; arg1isEReg = True; preShift32R = True; break; case Iop_InterleaveHI32x2: op = Asse_UNPCKLD; arg1isEReg = True; preShift32R = True; break; case Iop_InterleaveLO8x8: op = Asse_UNPCKLB; arg1isEReg = True; break; case Iop_InterleaveLO16x4: op = Asse_UNPCKLW; arg1isEReg = True; break; case Iop_InterleaveLO32x2: op = Asse_UNPCKLD; arg1isEReg = True; break; case Iop_Add8x8: op = Asse_ADD8; break; case Iop_Add16x4: op = Asse_ADD16; break; case Iop_Add32x2: op = Asse_ADD32; break; case Iop_QAdd8Sx8: op = Asse_QADD8S; break; case Iop_QAdd16Sx4: op = Asse_QADD16S; break; case Iop_QAdd8Ux8: op = Asse_QADD8U; break; case Iop_QAdd16Ux4: op = Asse_QADD16U; break; case Iop_Avg8Ux8: op = Asse_AVG8U; break; case Iop_Avg16Ux4: op = Asse_AVG16U; break; case Iop_CmpEQ8x8: op = Asse_CMPEQ8; break; case Iop_CmpEQ16x4: op = Asse_CMPEQ16; break; case Iop_CmpEQ32x2: op = Asse_CMPEQ32; break; case Iop_CmpGT8Sx8: op = Asse_CMPGT8S; break; case Iop_CmpGT16Sx4: op = Asse_CMPGT16S; break; case Iop_CmpGT32Sx2: op = Asse_CMPGT32S; break; case Iop_Max16Sx4: op = Asse_MAX16S; break; case Iop_Max8Ux8: op = Asse_MAX8U; break; case Iop_Min16Sx4: op = Asse_MIN16S; break; case Iop_Min8Ux8: op = Asse_MIN8U; break; case Iop_MulHi16Ux4: op = Asse_MULHI16U; break; case Iop_MulHi16Sx4: op = Asse_MULHI16S; break; case Iop_Mul16x4: op = Asse_MUL16; break; case Iop_Sub8x8: op = Asse_SUB8; break; case Iop_Sub16x4: op = Asse_SUB16; break; case Iop_Sub32x2: op = Asse_SUB32; break; case Iop_QSub8Sx8: op = Asse_QSUB8S; break; case Iop_QSub16Sx4: op = Asse_QSUB16S; break; case Iop_QSub8Ux8: op = Asse_QSUB8U; break; case Iop_QSub16Ux4: op = Asse_QSUB16U; break; default: break; } if (op != Asse_INVALID) { /* This isn't pretty, but .. move each arg to the low half of an XMM register, do the operation on the whole register, and move the result back to an integer register. */ const IRExpr* arg1 = e->Iex.Binop.arg1; const IRExpr* arg2 = e->Iex.Binop.arg2; vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64); vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64); HReg iarg1 = iselIntExpr_R(env, arg1); HReg iarg2 = iselIntExpr_R(env, arg2); HReg varg1 = newVRegV(env); HReg varg2 = newVRegV(env); HReg idst = newVRegI(env); addInstr(env, AMD64Instr_SseMOVQ(iarg1, varg1, True/*toXMM*/)); addInstr(env, AMD64Instr_SseMOVQ(iarg2, varg2, True/*toXMM*/)); if (arg1isEReg) { if (preShift32R) { addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg1)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 32, varg2)); } addInstr(env, AMD64Instr_SseReRg(op, varg1, varg2)); addInstr(env, AMD64Instr_SseMOVQ(idst, varg2, False/*!toXMM*/)); } else { vassert(!preShift32R); addInstr(env, AMD64Instr_SseReRg(op, varg2, varg1)); addInstr(env, AMD64Instr_SseMOVQ(idst, varg1, False/*!toXMM*/)); } return idst; } UInt laneBits = 0; op = Asse_INVALID; switch (e->Iex.Binop.op) { case Iop_ShlN16x4: laneBits = 16; op = Asse_SHL16; break; case Iop_ShlN32x2: laneBits = 32; op = Asse_SHL32; break; case Iop_SarN16x4: laneBits = 16; op = Asse_SAR16; break; case Iop_SarN32x2: laneBits = 32; op = Asse_SAR32; break; case Iop_ShrN16x4: laneBits = 16; op = Asse_SHR16; break; case Iop_ShrN32x2: laneBits = 32; op = Asse_SHR32; break; default: break; } if (op != Asse_INVALID) { const IRExpr* arg1 = e->Iex.Binop.arg1; const IRExpr* arg2 = e->Iex.Binop.arg2; vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64); vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I8); HReg igreg = iselIntExpr_R(env, arg1); HReg vgreg = newVRegV(env); HReg idst = newVRegI(env); addInstr(env, AMD64Instr_SseMOVQ(igreg, vgreg, True/*toXMM*/)); /* If it's a shift by an in-range immediate, generate a single instruction. */ if (arg2->tag == Iex_Const) { IRConst* c = arg2->Iex.Const.con; vassert(c->tag == Ico_U8); UInt shift = c->Ico.U8; if (shift < laneBits) { addInstr(env, AMD64Instr_SseShiftN(op, shift, vgreg)); addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/)); return idst; } } /* Otherwise we have to do it the longwinded way. */ HReg ishift = iselIntExpr_R(env, arg2); HReg vshift = newVRegV(env); addInstr(env, AMD64Instr_SseMOVQ(ishift, vshift, True/*toXMM*/)); addInstr(env, AMD64Instr_SseReRg(op, vshift, vgreg)); addInstr(env, AMD64Instr_SseMOVQ(idst, vgreg, False/*!toXMM*/)); return idst; } if (e->Iex.Binop.op == Iop_Mul32x2) { const IRExpr* arg1 = e->Iex.Binop.arg1; const IRExpr* arg2 = e->Iex.Binop.arg2; vassert(typeOfIRExpr(env->type_env, arg1) == Ity_I64); vassert(typeOfIRExpr(env->type_env, arg2) == Ity_I64); HReg s1 = iselIntExpr_R(env, arg1); HReg s2 = iselIntExpr_R(env, arg2); HReg resLo = newVRegI(env); // resLo = (s1 *64 s2) & 0xFFFF'FFFF addInstr(env, mk_iMOVsd_RR(s1, resLo)); addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(s2), resLo)); addInstr(env, AMD64Instr_MovxLQ(False, resLo, resLo)); // resHi = ((s1 >>u 32) *64 (s2 >>u 32)) << 32; HReg resHi = newVRegI(env); addInstr(env, mk_iMOVsd_RR(s1, resHi)); addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, resHi)); HReg tmp = newVRegI(env); addInstr(env, mk_iMOVsd_RR(s2, tmp)); addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, tmp)); addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(tmp), resHi)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, resHi)); // final result = resHi | resLo addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(resHi), resLo)); return resLo; } // A few remaining SIMD64 ops require helper functions, at least for // now. Bool second_is_UInt = False; HWord fn = 0; switch (e->Iex.Binop.op) { case Iop_CatOddLanes16x4: fn = (HWord)h_generic_calc_CatOddLanes16x4; break; case Iop_CatEvenLanes16x4: fn = (HWord)h_generic_calc_CatEvenLanes16x4; break; case Iop_PermOrZero8x8: fn = (HWord)h_generic_calc_PermOrZero8x8; break; case Iop_QNarrowBin32Sto16Sx4: fn = (HWord)h_generic_calc_QNarrowBin32Sto16Sx4; break; case Iop_QNarrowBin16Sto8Sx8: fn = (HWord)h_generic_calc_QNarrowBin16Sto8Sx8; break; case Iop_QNarrowBin16Sto8Ux8: fn = (HWord)h_generic_calc_QNarrowBin16Sto8Ux8; break; case Iop_NarrowBin16to8x8: fn = (HWord)h_generic_calc_NarrowBin16to8x8; break; case Iop_NarrowBin32to16x4: fn = (HWord)h_generic_calc_NarrowBin32to16x4; break; case Iop_SarN8x8: fn = (HWord)h_generic_calc_SarN8x8; second_is_UInt = True; break; default: fn = (HWord)0; break; } if (fn != (HWord)0) { /* Note: the following assumes all helpers are of signature ULong fn ( ULong, ULong ), and they are not marked as regparm functions. */ HReg dst = newVRegI(env); HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1); HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); if (second_is_UInt) addInstr(env, AMD64Instr_MovxLQ(False, argR, argR)); addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) ); addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) ); addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2, mk_RetLoc_simple(RLPri_Int) )); addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst)); return dst; } // Half-float vector conversion if (e->Iex.Binop.op == Iop_F32toF16x4 && (env->hwcaps & VEX_HWCAPS_AMD64_F16C)) { HReg srcV = iselVecExpr(env, e->Iex.Binop.arg2); HReg dstV = newVRegV(env); HReg dstI = newVRegI(env); set_SSE_rounding_mode( env, e->Iex.Binop.arg1 ); addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcV, dstV)); set_SSE_rounding_default(env); addInstr(env, AMD64Instr_SseMOVQ(dstI, dstV, /*toXMM=*/False)); return dstI; } break; } /* --------- UNARY OP --------- */ case Iex_Unop: { /* 1Uto8(64to1(expr64)) */ { DEFINE_PATTERN( p_1Uto8_64to1, unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) ); if (matchIRExpr(&mi,p_1Uto8_64to1,e)) { const IRExpr* expr64 = mi.bindee[0]; HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, expr64); addInstr(env, mk_iMOVsd_RR(src,dst) ); addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(1), dst)); return dst; } } /* 8Uto64(LDle(expr64)) */ { DEFINE_PATTERN(p_LDle8_then_8Uto64, unop(Iop_8Uto64, IRExpr_Load(Iend_LE,Ity_I8,bind(0))) ); if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) { HReg dst = newVRegI(env); AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] ); addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst)); return dst; } } /* 16Uto64(LDle(expr64)) */ { DEFINE_PATTERN(p_LDle16_then_16Uto64, unop(Iop_16Uto64, IRExpr_Load(Iend_LE,Ity_I16,bind(0))) ); if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) { HReg dst = newVRegI(env); AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] ); addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst)); return dst; } } /* 32Uto64( Add32/Sub32/And32/Or32/Xor32(expr32, expr32) ) Use 32 bit arithmetic and let the default zero-extend rule do the 32Uto64 for free. */ if (e->Iex.Unop.op == Iop_32Uto64 && e->Iex.Unop.arg->tag == Iex_Binop) { IROp opi = e->Iex.Unop.arg->Iex.Binop.op; /* inner op */ IRExpr* argL = e->Iex.Unop.arg->Iex.Binop.arg1; IRExpr* argR = e->Iex.Unop.arg->Iex.Binop.arg2; AMD64AluOp aluOp = Aalu_INVALID; switch (opi) { case Iop_Add32: aluOp = Aalu_ADD; break; case Iop_Sub32: aluOp = Aalu_SUB; break; case Iop_And32: aluOp = Aalu_AND; break; case Iop_Or32: aluOp = Aalu_OR; break; case Iop_Xor32: aluOp = Aalu_XOR; break; default: break; } if (aluOp != Aalu_INVALID) { /* For commutative ops we assume any literal values are on the second operand. */ HReg dst = newVRegI(env); HReg reg = iselIntExpr_R(env, argL); AMD64RMI* rmi = iselIntExpr_RMI(env, argR); addInstr(env, mk_iMOVsd_RR(reg,dst)); addInstr(env, AMD64Instr_Alu32R(aluOp, rmi, dst)); return dst; } /* just fall through to normal handling for Iop_32Uto64 */ } /* Fallback cases */ switch (e->Iex.Unop.op) { case Iop_32Uto64: case Iop_32Sto64: { HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64, src, dst) ); return dst; } case Iop_128HIto64: { HReg rHi, rLo; iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg); return rHi; /* and abandon rLo */ } case Iop_128to64: { HReg rHi, rLo; iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg); return rLo; /* and abandon rHi */ } case Iop_8Uto16: case Iop_8Uto32: case Iop_8Uto64: case Iop_16Uto64: case Iop_16Uto32: { HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32 || e->Iex.Unop.op==Iop_16Uto64 ); UInt mask = srcIs16 ? 0xFFFF : 0xFF; addInstr(env, mk_iMOVsd_RR(src,dst) ); addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(mask), dst)); return dst; } case Iop_8Sto16: case Iop_8Sto64: case Iop_8Sto32: case Iop_16Sto32: case Iop_16Sto64: { HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32 || e->Iex.Unop.op==Iop_16Sto64 ); UInt amt = srcIs16 ? 48 : 56; addInstr(env, mk_iMOVsd_RR(src,dst) ); addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst)); addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst)); return dst; } case Iop_Not8: case Iop_Not16: case Iop_Not32: case Iop_Not64: { HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, mk_iMOVsd_RR(src,dst) ); addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst)); return dst; } case Iop_16HIto8: case Iop_32HIto16: case Iop_64HIto32: { HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); Int shift = 0; switch (e->Iex.Unop.op) { case Iop_16HIto8: shift = 8; break; case Iop_32HIto16: shift = 16; break; case Iop_64HIto32: shift = 32; break; default: vassert(0); } addInstr(env, mk_iMOVsd_RR(src,dst) ); addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst)); return dst; } case Iop_1Uto64: case Iop_1Uto32: case Iop_1Uto8: { HReg dst = newVRegI(env); AMD64CondCode cond = iselCondCode_C(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_Set64(cond,dst)); return dst; } case Iop_1Sto8: case Iop_1Sto16: case Iop_1Sto32: case Iop_1Sto64: { HReg dst = newVRegI(env); HReg tmp = iselCondCode_R(env, e->Iex.Unop.arg); addInstr(env, mk_iMOVsd_RR(tmp, dst)); addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst)); addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst)); return dst; } case Iop_Ctz64: { /* Count trailing zeroes, implemented by amd64 'bsfq' */ HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_Bsfr64(True,src,dst)); return dst; } case Iop_Clz64: { /* Count leading zeroes. Do 'bsrq' to establish the index of the highest set bit, and subtract that value from 63. */ HReg tmp = newVRegI(env); HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_Bsfr64(False,src,tmp)); addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(63), dst)); addInstr(env, AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Reg(tmp), dst)); return dst; } case Iop_CmpwNEZ64: { HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, mk_iMOVsd_RR(src,dst)); addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst)); addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst)); addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst)); return dst; } case Iop_CmpwNEZ32: { HReg src = newVRegI(env); HReg dst = newVRegI(env); HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, mk_iMOVsd_RR(pre,src)); addInstr(env, AMD64Instr_MovxLQ(False, src, src)); addInstr(env, mk_iMOVsd_RR(src,dst)); addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst)); addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst)); addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst)); return dst; } case Iop_Left8: case Iop_Left16: case Iop_Left32: case Iop_Left64: { HReg dst = newVRegI(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, mk_iMOVsd_RR(src, dst)); addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst)); addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst)); return dst; } case Iop_V128to32: { HReg dst = newVRegI(env); HReg vec = iselVecExpr(env, e->Iex.Unop.arg); AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP()); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16)); addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst)); return dst; } /* V128{HI}to64 */ case Iop_V128to64: { HReg dst = newVRegI(env); HReg vec = iselVecExpr(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/)); return dst; } case Iop_V128HIto64: { HReg dst = newVRegI(env); HReg vec = iselVecExpr(env, e->Iex.Unop.arg); HReg vec2 = newVRegV(env); addInstr(env, mk_vMOVsd_RR(vec, vec2)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2)); addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/)); return dst; } /* V256to64_{3,2,1,0} */ case Iop_V256to64_0: case Iop_V256to64_1: case Iop_V256to64_2: case Iop_V256to64_3: { HReg vHi, vLo, vec; iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg); /* Do the first part of the selection by deciding which of the 128 bit registers to look at, and second part using the same scheme as for V128{HI}to64 above. */ Bool low64of128 = True; switch (e->Iex.Unop.op) { case Iop_V256to64_0: vec = vLo; low64of128 = True; break; case Iop_V256to64_1: vec = vLo; low64of128 = False; break; case Iop_V256to64_2: vec = vHi; low64of128 = True; break; case Iop_V256to64_3: vec = vHi; low64of128 = False; break; default: vassert(0); } HReg dst = newVRegI(env); if (low64of128) { addInstr(env, AMD64Instr_SseMOVQ(dst, vec, False/*!toXMM*/)); } else { HReg vec2 = newVRegV(env); addInstr(env, mk_vMOVsd_RR(vec, vec2)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, vec2)); addInstr(env, AMD64Instr_SseMOVQ(dst, vec2, False/*!toXMM*/)); } return dst; } /* ReinterpF64asI64(e) */ /* Given an IEEE754 double, produce an I64 with the same bit pattern. */ case Iop_ReinterpF64asI64: { AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg dst = newVRegI(env); HReg src = iselDblExpr(env, e->Iex.Unop.arg); /* paranoia */ set_SSE_rounding_default(env); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp)); addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst)); return dst; } /* ReinterpF32asI32(e) */ /* Given an IEEE754 single, produce an I64 with the same bit pattern in the lower half. */ case Iop_ReinterpF32asI32: { AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg dst = newVRegI(env); HReg src = iselFltExpr(env, e->Iex.Unop.arg); /* paranoia */ set_SSE_rounding_default(env); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp)); addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst )); return dst; } case Iop_16to8: case Iop_32to8: case Iop_64to8: case Iop_32to16: case Iop_64to16: case Iop_64to32: /* These are no-ops. */ return iselIntExpr_R(env, e->Iex.Unop.arg); case Iop_GetMSBs8x8: { /* Note: the following assumes the helper is of signature UInt fn ( ULong ), and is not a regparm fn. */ HReg dst = newVRegI(env); HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg); HWord fn = (HWord)h_generic_calc_GetMSBs8x8; addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) ); addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1, mk_RetLoc_simple(RLPri_Int) )); /* MovxLQ is not exactly the right thing here. We just need to get the bottom 8 bits of RAX into dst, and zero out everything else. Assuming that the helper returns a UInt with the top 24 bits zeroed out, it'll do, though. */ addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst)); return dst; } case Iop_GetMSBs8x16: { /* Note: the following assumes the helper is of signature UInt fn ( ULong w64hi, ULong w64Lo ), and is not a regparm fn. */ HReg dst = newVRegI(env); HReg vec = iselVecExpr(env, e->Iex.Unop.arg); HReg rsp = hregAMD64_RSP(); HWord fn = (HWord)h_generic_calc_GetMSBs8x16; AMD64AMode* m8_rsp = AMD64AMode_IR( -8, rsp); AMD64AMode* m16_rsp = AMD64AMode_IR(-16, rsp); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, m16_rsp)); /* hi 64 bits into RDI -- the first arg */ addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(m8_rsp), hregAMD64_RDI() )); /* 1st arg */ /* lo 64 bits into RSI -- the 2nd arg */ addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(m16_rsp), hregAMD64_RSI() )); /* 2nd arg */ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2, mk_RetLoc_simple(RLPri_Int) )); /* MovxLQ is not exactly the right thing here. We just need to get the bottom 16 bits of RAX into dst, and zero out everything else. Assuming that the helper returns a UInt with the top 16 bits zeroed out, it'll do, though. */ addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst)); return dst; } default: break; } /* Deal with unary 64-bit SIMD ops. */ HWord fn = 0; switch (e->Iex.Unop.op) { case Iop_CmpNEZ32x2: fn = (HWord)h_generic_calc_CmpNEZ32x2; break; case Iop_CmpNEZ16x4: fn = (HWord)h_generic_calc_CmpNEZ16x4; break; case Iop_CmpNEZ8x8: fn = (HWord)h_generic_calc_CmpNEZ8x8; break; default: fn = (HWord)0; break; } if (fn != (HWord)0) { /* Note: the following assumes all helpers are of signature ULong fn ( ULong ), and they are not marked as regparm functions. */ HReg dst = newVRegI(env); HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) ); addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1, mk_RetLoc_simple(RLPri_Int) )); addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst)); return dst; } break; } /* --------- GET --------- */ case Iex_Get: { if (ty == Ity_I64) { HReg dst = newVRegI(env); addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem( AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())), dst)); return dst; } if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) { HReg dst = newVRegI(env); addInstr(env, AMD64Instr_LoadEX( toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)), False, AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()), dst)); return dst; } break; } case Iex_GetI: { AMD64AMode* am = genGuestArrayOffset( env, e->Iex.GetI.descr, e->Iex.GetI.ix, e->Iex.GetI.bias ); HReg dst = newVRegI(env); if (ty == Ity_I8) { addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst )); return dst; } if (ty == Ity_I64) { addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst )); return dst; } break; } /* --------- CCALL --------- */ case Iex_CCall: { HReg dst = newVRegI(env); vassert(ty == e->Iex.CCall.retty); /* be very restrictive for now. Only 64-bit ints allowed for args, and 64 or 32 bits for return type. */ if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32) goto irreducible; /* Marshal args, do the call. */ UInt addToSp = 0; RetLoc rloc = mk_RetLoc_INVALID(); doHelperCall( &addToSp, &rloc, env, NULL/*guard*/, e->Iex.CCall.cee, e->Iex.CCall.retty, e->Iex.CCall.args ); vassert(is_sane_RetLoc(rloc)); vassert(rloc.pri == RLPri_Int); vassert(addToSp == 0); /* Move to dst, and zero out the top 32 bits if the result type is Ity_I32. Probably overkill, but still .. */ if (e->Iex.CCall.retty == Ity_I64) addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst)); else addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst)); return dst; } /* --------- LITERAL --------- */ /* 64/32/16/8-bit literals */ case Iex_Const: if (ty == Ity_I64) { HReg r = newVRegI(env); addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r)); return r; } else { AMD64RMI* rmi = iselIntExpr_RMI ( env, e ); HReg r = newVRegI(env); addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r)); return r; } /* --------- MULTIPLEX --------- */ case Iex_ITE: { // VFD if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) && typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1) { HReg r1 = iselIntExpr_R(env, e->Iex.ITE.iftrue); HReg r0 = iselIntExpr_R(env, e->Iex.ITE.iffalse); HReg dst = newVRegI(env); addInstr(env, mk_iMOVsd_RR(r1,dst)); AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond); addInstr(env, AMD64Instr_CMov64(cc ^ 1, r0, dst)); return dst; } break; } /* --------- TERNARY OP --------- */ case Iex_Triop: { IRTriop *triop = e->Iex.Triop.details; /* C3210 flags following FPU partial remainder (fprem), both IEEE compliant (PREM1) and non-IEEE compliant (PREM). */ if (triop->op == Iop_PRemC3210F64 || triop->op == Iop_PRem1C3210F64) { AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg arg1 = iselDblExpr(env, triop->arg2); HReg arg2 = iselDblExpr(env, triop->arg3); HReg dst = newVRegI(env); addInstr(env, AMD64Instr_A87Free(2)); /* one arg -> top of x87 stack */ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8)); /* other arg -> top of x87 stack */ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8)); switch (triop->op) { case Iop_PRemC3210F64: addInstr(env, AMD64Instr_A87FpOp(Afp_PREM)); break; case Iop_PRem1C3210F64: addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1)); break; default: vassert(0); } /* Ignore the result, and instead make off with the FPU's C3210 flags (in the status word). */ addInstr(env, AMD64Instr_A87StSW(m8_rsp)); addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst)); addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst)); return dst; } break; } default: break; } /* switch (e->tag) */ /* We get here if no pattern matched. */ irreducible: ppIRExpr(e); vpanic("iselIntExpr_R(amd64): cannot reduce tree"); } /*---------------------------------------------------------*/ /*--- ISEL: Integer expression auxiliaries ---*/ /*---------------------------------------------------------*/ /* --------------------- AMODEs --------------------- */ /* Return an AMode which computes the value of the specified expression, possibly also adding insns to the code list as a result. The expression may only be a 32-bit one. */ static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, const IRExpr* e ) { AMD64AMode* am = iselIntExpr_AMode_wrk(env, e); vassert(sane_AMode(am)); return am; } /* DO NOT CALL THIS DIRECTLY ! */ static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, const IRExpr* e ) { MatchInfo mi; DECLARE_PATTERN(p_complex); IRType ty = typeOfIRExpr(env->type_env,e); vassert(ty == Ity_I64); /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */ /* bind0 bind1 bind2 bind3 */ DEFINE_PATTERN(p_complex, binop( Iop_Add64, binop( Iop_Add64, bind(0), binop(Iop_Shl64, bind(1), bind(2)) ), bind(3) ) ); if (matchIRExpr(&mi, p_complex, e)) { const IRExpr* expr1 = mi.bindee[0]; const IRExpr* expr2 = mi.bindee[1]; const IRExpr* imm8 = mi.bindee[2]; const IRExpr* simm32 = mi.bindee[3]; if (imm8->tag == Iex_Const && imm8->Iex.Const.con->tag == Ico_U8 && imm8->Iex.Const.con->Ico.U8 < 4 /* imm8 is OK, now check simm32 */ && simm32->tag == Iex_Const && simm32->Iex.Const.con->tag == Ico_U64 && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) { UInt shift = imm8->Iex.Const.con->Ico.U8; UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64); HReg r1 = iselIntExpr_R(env, expr1); HReg r2 = iselIntExpr_R(env, expr2); vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3); return AMD64AMode_IRRS(offset, r1, r2, shift); } } /* Add64(expr1, Shl64(expr2, imm)) */ if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_Add64 && e->Iex.Binop.arg2->tag == Iex_Binop && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64 && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) { UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8; if (shift == 1 || shift == 2 || shift == 3) { HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1); HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 ); return AMD64AMode_IRRS(0, r1, r2, shift); } } /* Add64(expr,i) */ if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_Add64 && e->Iex.Binop.arg2->tag == Iex_Const && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64 && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) { HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1); return AMD64AMode_IR( toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64), r1 ); } /* Doesn't match anything in particular. Generate it into a register and use that. */ { HReg r1 = iselIntExpr_R(env, e); return AMD64AMode_IR(0, r1); } } /* --------------------- RMIs --------------------- */ /* Similarly, calculate an expression into an X86RMI operand. As with iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */ static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, const IRExpr* e ) { AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e); /* sanity checks ... */ switch (rmi->tag) { case Armi_Imm: return rmi; case Armi_Reg: vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64); vassert(hregIsVirtual(rmi->Armi.Reg.reg)); return rmi; case Armi_Mem: vassert(sane_AMode(rmi->Armi.Mem.am)); return rmi; default: vpanic("iselIntExpr_RMI: unknown amd64 RMI tag"); } } /* DO NOT CALL THIS DIRECTLY ! */ static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, const IRExpr* e ) { IRType ty = typeOfIRExpr(env->type_env,e); vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8); /* special case: immediate 64/32/16/8 */ if (e->tag == Iex_Const) { switch (e->Iex.Const.con->tag) { case Ico_U64: if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) { return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)); } break; case Ico_U32: return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break; case Ico_U16: return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break; case Ico_U8: return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break; default: vpanic("iselIntExpr_RMI.Iex_Const(amd64)"); } } /* special case: 64-bit GET */ if (e->tag == Iex_Get && ty == Ity_I64) { return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())); } /* special case: 64-bit load from memory */ if (e->tag == Iex_Load && ty == Ity_I64 && e->Iex.Load.end == Iend_LE) { AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr); return AMD64RMI_Mem(am); } /* default case: calculate into a register and return that */ { HReg r = iselIntExpr_R ( env, e ); return AMD64RMI_Reg(r); } } /* --------------------- RIs --------------------- */ /* Calculate an expression into an AMD64RI operand. As with iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */ static AMD64RI* iselIntExpr_RI ( ISelEnv* env, const IRExpr* e ) { AMD64RI* ri = iselIntExpr_RI_wrk(env, e); /* sanity checks ... */ switch (ri->tag) { case Ari_Imm: return ri; case Ari_Reg: vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64); vassert(hregIsVirtual(ri->Ari.Reg.reg)); return ri; default: vpanic("iselIntExpr_RI: unknown amd64 RI tag"); } } /* DO NOT CALL THIS DIRECTLY ! */ static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, const IRExpr* e ) { IRType ty = typeOfIRExpr(env->type_env,e); vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8); /* special case: immediate */ if (e->tag == Iex_Const) { switch (e->Iex.Const.con->tag) { case Ico_U64: if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) { return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64)); } break; case Ico_U32: return AMD64RI_Imm(e->Iex.Const.con->Ico.U32); case Ico_U16: return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); case Ico_U8: return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8); default: vpanic("iselIntExpr_RMI.Iex_Const(amd64)"); } } /* default case: calculate into a register and return that */ { HReg r = iselIntExpr_R ( env, e ); return AMD64RI_Reg(r); } } /* --------------------- RMs --------------------- */ /* Similarly, calculate an expression into an AMD64RM operand. As with iselIntExpr_R, the expression can have type 64, 32, 16 or 8 bits. */ static AMD64RM* iselIntExpr_RM ( ISelEnv* env, const IRExpr* e ) { AMD64RM* rm = iselIntExpr_RM_wrk(env, e); /* sanity checks ... */ switch (rm->tag) { case Arm_Reg: vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64); vassert(hregIsVirtual(rm->Arm.Reg.reg)); return rm; case Arm_Mem: vassert(sane_AMode(rm->Arm.Mem.am)); return rm; default: vpanic("iselIntExpr_RM: unknown amd64 RM tag"); } } /* DO NOT CALL THIS DIRECTLY ! */ static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, const IRExpr* e ) { IRType ty = typeOfIRExpr(env->type_env,e); vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8); /* special case: 64-bit GET */ if (e->tag == Iex_Get && ty == Ity_I64) { return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())); } /* special case: load from memory */ /* default case: calculate into a register and return that */ { HReg r = iselIntExpr_R ( env, e ); return AMD64RM_Reg(r); } } /* --------------------- CONDCODE as %rflag test --------------------- */ /* Generate code to evaluated a bit-typed expression, returning the condition code which would correspond when the expression would notionally have returned 1. Note that iselCondCode_C and iselCondCode_R are mutually recursive. For future changes to either of them, take care not to introduce an infinite loop involving the two of them. */ static AMD64CondCode iselCondCode_C ( ISelEnv* env, const IRExpr* e ) { /* Uh, there's nothing we can sanity check here, unfortunately. */ return iselCondCode_C_wrk(env,e); } /* DO NOT CALL THIS DIRECTLY ! */ static AMD64CondCode iselCondCode_C_wrk ( ISelEnv* env, const IRExpr* e ) { vassert(e); vassert(typeOfIRExpr(env->type_env,e) == Ity_I1); /* var */ if (e->tag == Iex_RdTmp) { HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp); addInstr(env, AMD64Instr_Test64(1,r64)); return Acc_NZ; } /* Constant 1:Bit */ if (e->tag == Iex_Const) { HReg r; vassert(e->Iex.Const.con->tag == Ico_U1); vassert(e->Iex.Const.con->Ico.U1 == True || e->Iex.Const.con->Ico.U1 == False); r = newVRegI(env); addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r)); addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r)); return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ; } /* Not1(...) */ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) { /* Generate code for the arg, and negate the test condition */ return 1 ^ iselCondCode_C(env, e->Iex.Unop.arg); } /* --- patterns rooted at: 64to1 --- */ /* 64to1 */ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) { HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_Test64(1,reg)); return Acc_NZ; } /* --- patterns rooted at: 32to1 --- */ /* 32to1 */ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_32to1) { HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_Test64(1,reg)); return Acc_NZ; } /* --- patterns rooted at: CmpNEZ8 --- */ /* CmpNEZ8(x) */ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_CmpNEZ8) { HReg r = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_Test64(0xFF,r)); return Acc_NZ; } /* --- patterns rooted at: CmpNEZ16 --- */ /* CmpNEZ16(x) */ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_CmpNEZ16) { HReg r = iselIntExpr_R(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_Test64(0xFFFF,r)); return Acc_NZ; } /* --- patterns rooted at: CmpNEZ32 --- */ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_CmpNEZ32) { IRExpr* arg = e->Iex.Unop.arg; if (arg->tag == Iex_Binop && (arg->Iex.Binop.op == Iop_Or32 || arg->Iex.Binop.op == Iop_And32)) { /* CmpNEZ32(Or32(x,y)) */ /* CmpNEZ32(And32(x,y)) */ HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1); AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2); HReg tmp = newVRegI(env); addInstr(env, mk_iMOVsd_RR(r0, tmp)); addInstr(env, AMD64Instr_Alu32R( arg->Iex.Binop.op == Iop_Or32 ? Aalu_OR : Aalu_AND, rmi1, tmp)); return Acc_NZ; } /* CmpNEZ32(x) */ HReg r1 = iselIntExpr_R(env, arg); AMD64RMI* rmi2 = AMD64RMI_Imm(0); addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1)); return Acc_NZ; } /* --- patterns rooted at: CmpNEZ64 --- */ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_CmpNEZ64) { IRExpr* arg = e->Iex.Unop.arg; if (arg->tag == Iex_Binop && (arg->Iex.Binop.op == Iop_Or64 || arg->Iex.Binop.op == Iop_And64)) { /* CmpNEZ64(Or64(x,y)) */ /* CmpNEZ64(And64(x,y)) */ HReg r0 = iselIntExpr_R(env, arg->Iex.Binop.arg1); AMD64RMI* rmi1 = iselIntExpr_RMI(env, arg->Iex.Binop.arg2); HReg tmp = newVRegI(env); addInstr(env, mk_iMOVsd_RR(r0, tmp)); addInstr(env, AMD64Instr_Alu64R( arg->Iex.Binop.op == Iop_Or64 ? Aalu_OR : Aalu_AND, rmi1, tmp)); return Acc_NZ; } /* CmpNEZ64(x) */ HReg r1 = iselIntExpr_R(env, arg); AMD64RMI* rmi2 = AMD64RMI_Imm(0); addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1)); return Acc_NZ; } /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */ /* CmpEQ8 / CmpNE8 */ if (e->tag == Iex_Binop && (e->Iex.Binop.op == Iop_CmpEQ8 || e->Iex.Binop.op == Iop_CmpNE8 || e->Iex.Binop.op == Iop_CasCmpEQ8 || e->Iex.Binop.op == Iop_CasCmpNE8)) { if (isZeroU8(e->Iex.Binop.arg2)) { HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1); addInstr(env, AMD64Instr_Test64(0xFF,r1)); switch (e->Iex.Binop.op) { case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z; case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ; default: vpanic("iselCondCode_C(amd64): CmpXX8(expr,0:I8)"); } } else { HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1); AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2); HReg r = newVRegI(env); addInstr(env, mk_iMOVsd_RR(r1,r)); addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r)); addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r)); switch (e->Iex.Binop.op) { case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z; case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ; default: vpanic("iselCondCode_C(amd64): CmpXX8(expr,expr)"); } } } /* CmpEQ16 / CmpNE16 */ if (e->tag == Iex_Binop && (e->Iex.Binop.op == Iop_CmpEQ16 || e->Iex.Binop.op == Iop_CmpNE16 || e->Iex.Binop.op == Iop_CasCmpEQ16 || e->Iex.Binop.op == Iop_CasCmpNE16)) { HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1); AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2); HReg r = newVRegI(env); addInstr(env, mk_iMOVsd_RR(r1,r)); addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r)); addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r)); switch (e->Iex.Binop.op) { case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z; case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ; default: vpanic("iselCondCode_C(amd64): CmpXX16"); } } /* CmpNE64(ccall, 64-bit constant) (--smc-check=all optimisation). Saves a "movq %rax, %tmp" compared to the default route. */ if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_CmpNE64 && e->Iex.Binop.arg1->tag == Iex_CCall && e->Iex.Binop.arg2->tag == Iex_Const) { IRExpr* cal = e->Iex.Binop.arg1; IRExpr* con = e->Iex.Binop.arg2; HReg tmp = newVRegI(env); /* clone & partial-eval of generic Iex_CCall and Iex_Const cases */ vassert(cal->Iex.CCall.retty == Ity_I64); /* else ill-typed IR */ vassert(con->Iex.Const.con->tag == Ico_U64); /* Marshal args, do the call. */ UInt addToSp = 0; RetLoc rloc = mk_RetLoc_INVALID(); doHelperCall( &addToSp, &rloc, env, NULL/*guard*/, cal->Iex.CCall.cee, cal->Iex.CCall.retty, cal->Iex.CCall.args ); vassert(is_sane_RetLoc(rloc)); vassert(rloc.pri == RLPri_Int); vassert(addToSp == 0); /* */ addInstr(env, AMD64Instr_Imm64(con->Iex.Const.con->Ico.U64, tmp)); addInstr(env, AMD64Instr_Alu64R(Aalu_CMP, AMD64RMI_Reg(hregAMD64_RAX()), tmp)); return Acc_NZ; } /* Cmp*64*(x,y) */ if (e->tag == Iex_Binop && (e->Iex.Binop.op == Iop_CmpEQ64 || e->Iex.Binop.op == Iop_CmpNE64 || e->Iex.Binop.op == Iop_CmpLT64S || e->Iex.Binop.op == Iop_CmpLT64U || e->Iex.Binop.op == Iop_CmpLE64S || e->Iex.Binop.op == Iop_CmpLE64U || e->Iex.Binop.op == Iop_CasCmpEQ64 || e->Iex.Binop.op == Iop_CasCmpNE64 || e->Iex.Binop.op == Iop_ExpCmpNE64)) { HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1); AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2); addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1)); switch (e->Iex.Binop.op) { case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z; case Iop_CmpNE64: case Iop_CasCmpNE64: case Iop_ExpCmpNE64: return Acc_NZ; case Iop_CmpLT64S: return Acc_L; case Iop_CmpLT64U: return Acc_B; case Iop_CmpLE64S: return Acc_LE; case Iop_CmpLE64U: return Acc_BE; default: vpanic("iselCondCode_C(amd64): CmpXX64"); } } /* Cmp*32*(x,y) */ if (e->tag == Iex_Binop && (e->Iex.Binop.op == Iop_CmpEQ32 || e->Iex.Binop.op == Iop_CmpNE32 || e->Iex.Binop.op == Iop_CmpLT32S || e->Iex.Binop.op == Iop_CmpLT32U || e->Iex.Binop.op == Iop_CmpLE32S || e->Iex.Binop.op == Iop_CmpLE32U || e->Iex.Binop.op == Iop_CasCmpEQ32 || e->Iex.Binop.op == Iop_CasCmpNE32 || e->Iex.Binop.op == Iop_ExpCmpNE32)) { HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1); AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2); addInstr(env, AMD64Instr_Alu32R(Aalu_CMP,rmi2,r1)); switch (e->Iex.Binop.op) { case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z; case Iop_CmpNE32: case Iop_CasCmpNE32: case Iop_ExpCmpNE32: return Acc_NZ; case Iop_CmpLT32S: return Acc_L; case Iop_CmpLT32U: return Acc_B; case Iop_CmpLE32S: return Acc_LE; case Iop_CmpLE32U: return Acc_BE; default: vpanic("iselCondCode_C(amd64): CmpXX32"); } } /* And1(x,y), Or1(x,y) */ if (e->tag == Iex_Binop && (e->Iex.Binop.op == Iop_And1 || e->Iex.Binop.op == Iop_Or1)) { // Get the result in an int reg, then test the least significant bit. HReg tmp = iselCondCode_R(env, e); addInstr(env, AMD64Instr_Test64(1, tmp)); return Acc_NZ; } ppIRExpr(e); vpanic("iselCondCode_C(amd64)"); } /* --------------------- CONDCODE as int reg --------------------- */ /* Generate code to evaluated a bit-typed expression, returning the resulting value in bit 0 of an integer register. WARNING: all of the other bits in the register can be arbitrary. Callers must mask them off or otherwise ignore them, as necessary. Note that iselCondCode_C and iselCondCode_R are mutually recursive. For future changes to either of them, take care not to introduce an infinite loop involving the two of them. */ static HReg iselCondCode_R ( ISelEnv* env, const IRExpr* e ) { /* Uh, there's nothing we can sanity check here, unfortunately. */ return iselCondCode_R_wrk(env,e); } /* DO NOT CALL THIS DIRECTLY ! */ static HReg iselCondCode_R_wrk ( ISelEnv* env, const IRExpr* e ) { vassert(e); vassert(typeOfIRExpr(env->type_env,e) == Ity_I1); /* var */ if (e->tag == Iex_RdTmp) { return lookupIRTemp(env, e->Iex.RdTmp.tmp); } /* And1(x,y), Or1(x,y) */ if (e->tag == Iex_Binop && (e->Iex.Binop.op == Iop_And1 || e->Iex.Binop.op == Iop_Or1)) { HReg x_as_64 = iselCondCode_R(env, e->Iex.Binop.arg1); HReg y_as_64 = iselCondCode_R(env, e->Iex.Binop.arg2); HReg res = newVRegI(env); addInstr(env, mk_iMOVsd_RR(y_as_64, res)); AMD64AluOp aop = e->Iex.Binop.op == Iop_And1 ? Aalu_AND : Aalu_OR; addInstr(env, AMD64Instr_Alu64R(aop, AMD64RMI_Reg(x_as_64), res)); return res; } /* Anything else, we hand off to iselCondCode_C and force the value into a register. */ HReg res = newVRegI(env); AMD64CondCode cc = iselCondCode_C(env, e); addInstr(env, AMD64Instr_Set64(cc, res)); return res; ppIRExpr(e); vpanic("iselCondCode_R(amd64)"); } /*---------------------------------------------------------*/ /*--- ISEL: Integer expressions (128 bit) ---*/ /*---------------------------------------------------------*/ /* Compute a 128-bit value into a register pair, which is returned as the first two parameters. As with iselIntExpr_R, these may be either real or virtual regs; in any case they must not be changed by subsequent code emitted by the caller. */ static void iselInt128Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, const IRExpr* e ) { iselInt128Expr_wrk(rHi, rLo, env, e); # if 0 vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); # endif vassert(hregClass(*rHi) == HRcInt64); vassert(hregIsVirtual(*rHi)); vassert(hregClass(*rLo) == HRcInt64); vassert(hregIsVirtual(*rLo)); } /* DO NOT CALL THIS DIRECTLY ! */ static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, const IRExpr* e ) { vassert(e); vassert(typeOfIRExpr(env->type_env,e) == Ity_I128); /* read 128-bit IRTemp */ if (e->tag == Iex_RdTmp) { lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp); return; } /* --------- BINARY ops --------- */ if (e->tag == Iex_Binop) { switch (e->Iex.Binop.op) { /* 64 x 64 -> 128 multiply */ case Iop_MullU64: case Iop_MullS64: { /* get one operand into %rax, and the other into a R/M. Need to make an educated guess about which is better in which. */ HReg tLo = newVRegI(env); HReg tHi = newVRegI(env); Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64); AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1); HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2); addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX())); addInstr(env, AMD64Instr_MulL(syned, rmLeft)); /* Result is now in RDX:RAX. Tell the caller. */ addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi)); addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo)); *rHi = tHi; *rLo = tLo; return; } /* 128 x 64 -> (64(rem),64(div)) division */ case Iop_DivModU128to64: case Iop_DivModS128to64: { /* Get the 128-bit operand into rdx:rax, and the other into any old R/M. */ HReg sHi, sLo; HReg tLo = newVRegI(env); HReg tHi = newVRegI(env); Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64); AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2); iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1); addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX())); addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX())); addInstr(env, AMD64Instr_Div(syned, 8, rmRight)); addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi)); addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo)); *rHi = tHi; *rLo = tLo; return; } /* 64HLto128(e1,e2) */ case Iop_64HLto128: *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1); *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2); return; default: break; } } /* if (e->tag == Iex_Binop) */ ppIRExpr(e); vpanic("iselInt128Expr"); } /*---------------------------------------------------------*/ /*--- ISEL: Floating point expressions (32 bit) ---*/ /*---------------------------------------------------------*/ /* Nothing interesting here; really just wrappers for 64-bit stuff. */ static HReg iselFltExpr ( ISelEnv* env, const IRExpr* e ) { HReg r = iselFltExpr_wrk( env, e ); # if 0 vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); # endif vassert(hregClass(r) == HRcVec128); vassert(hregIsVirtual(r)); return r; } /* DO NOT CALL THIS DIRECTLY */ static HReg iselFltExpr_wrk ( ISelEnv* env, const IRExpr* e ) { IRType ty = typeOfIRExpr(env->type_env,e); vassert(ty == Ity_F32); if (e->tag == Iex_RdTmp) { return lookupIRTemp(env, e->Iex.RdTmp.tmp); } if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) { AMD64AMode* am; HReg res = newVRegV(env); vassert(e->Iex.Load.ty == Ity_F32); am = iselIntExpr_AMode(env, e->Iex.Load.addr); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am)); return res; } if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_F64toF32) { /* Although the result is still held in a standard SSE register, we need to round it to reflect the loss of accuracy/range entailed in casting it to a 32-bit float. */ HReg dst = newVRegV(env); HReg src = iselDblExpr(env, e->Iex.Binop.arg2); set_SSE_rounding_mode( env, e->Iex.Binop.arg1 ); addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst)); set_SSE_rounding_default( env ); return dst; } if (e->tag == Iex_Get) { AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset, hregAMD64_RBP() ); HReg res = newVRegV(env); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am )); return res; } if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_ReinterpI32asF32) { /* Given an I32, produce an IEEE754 float with the same bit pattern. */ HReg dst = newVRegV(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP()); addInstr(env, AMD64Instr_Store(4, src, m4_rsp)); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp )); return dst; } if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) { AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg arg = iselFltExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); /* rf now holds the value to be rounded. The first thing to do is set the FPU's rounding mode accordingly. */ /* Set host x87 rounding mode */ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 ); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp)); addInstr(env, AMD64Instr_A87Free(1)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4)); addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp)); /* Restore default x87 rounding. */ set_FPU_rounding_default( env ); return dst; } if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_NegF32) { /* Sigh ... very rough code. Could do much better. */ /* Get the 128-bit literal 00---0 10---0 into a register and xor it with the value to be negated. */ HReg r1 = newVRegI(env); HReg dst = newVRegV(env); HReg tmp = newVRegV(env); HReg src = iselFltExpr(env, e->Iex.Unop.arg); AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); addInstr(env, mk_vMOVsd_RR(src,tmp)); addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0))); addInstr(env, AMD64Instr_Imm64( 1ULL<<31, r1 )); addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1))); addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0)); addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst)); add_to_rsp(env, 16); return dst; } if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF32) { IRQop *qop = e->Iex.Qop.details; HReg dst = newVRegV(env); HReg argX = iselFltExpr(env, qop->arg2); HReg argY = iselFltExpr(env, qop->arg3); HReg argZ = iselFltExpr(env, qop->arg4); /* XXXROUNDINGFIXME */ /* set roundingmode here */ /* subq $16, %rsp -- make a space*/ sub_from_rsp(env, 16); /* Prepare 4 arg regs: leaq 0(%rsp), %rdi leaq 4(%rsp), %rsi leaq 8(%rsp), %rdx leaq 12(%rsp), %rcx */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()), hregAMD64_RDI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(4, hregAMD64_RSP()), hregAMD64_RSI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()), hregAMD64_RDX())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(12, hregAMD64_RSP()), hregAMD64_RCX())); /* Store the three args, at (%rsi), (%rdx) and (%rcx): movss %argX, 0(%rsi) movss %argY, 0(%rdx) movss %argZ, 0(%rcx) */ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argX, AMD64AMode_IR(0, hregAMD64_RSI()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argY, AMD64AMode_IR(0, hregAMD64_RDX()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 4, argZ, AMD64AMode_IR(0, hregAMD64_RCX()))); /* call the helper */ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)(HWord)h_generic_calc_MAddF32, 4, mk_RetLoc_simple(RLPri_None) )); /* fetch the result from memory, using %r_argp, which the register allocator will keep alive across the call. */ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 4, dst, AMD64AMode_IR(0, hregAMD64_RSP()))); /* and finally, clear the space */ add_to_rsp(env, 16); return dst; } if (e->tag == Iex_ITE) { // VFD HReg r1, r0, dst; vassert(ty == Ity_F32); vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1); r1 = iselFltExpr(env, e->Iex.ITE.iftrue); r0 = iselFltExpr(env, e->Iex.ITE.iffalse); dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(r1,dst)); AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond); addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst)); return dst; } ppIRExpr(e); vpanic("iselFltExpr_wrk"); } /*---------------------------------------------------------*/ /*--- ISEL: Floating point expressions (64 bit) ---*/ /*---------------------------------------------------------*/ /* Compute a 64-bit floating point value into the lower half of an xmm register, the identity of which is returned. As with iselIntExpr_R, the returned reg will be virtual, and it must not be changed by subsequent code emitted by the caller. */ /* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm: Type S (1 bit) E (11 bits) F (52 bits) ---- --------- ----------- ----------- signalling NaN u 2047 (max) .0uuuuu---u (with at least one 1 bit) quiet NaN u 2047 (max) .1uuuuu---u negative infinity 1 2047 (max) .000000---0 positive infinity 0 2047 (max) .000000---0 negative zero 1 0 .000000---0 positive zero 0 0 .000000---0 */ static HReg iselDblExpr ( ISelEnv* env, const IRExpr* e ) { HReg r = iselDblExpr_wrk( env, e ); # if 0 vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); # endif vassert(hregClass(r) == HRcVec128); vassert(hregIsVirtual(r)); return r; } /* DO NOT CALL THIS DIRECTLY */ static HReg iselDblExpr_wrk ( ISelEnv* env, const IRExpr* e ) { IRType ty = typeOfIRExpr(env->type_env,e); vassert(e); vassert(ty == Ity_F64); if (e->tag == Iex_RdTmp) { return lookupIRTemp(env, e->Iex.RdTmp.tmp); } if (e->tag == Iex_Const) { union { ULong u64; Double f64; } u; HReg res = newVRegV(env); HReg tmp = newVRegI(env); vassert(sizeof(u) == 8); vassert(sizeof(u.u64) == 8); vassert(sizeof(u.f64) == 8); if (e->Iex.Const.con->tag == Ico_F64) { u.f64 = e->Iex.Const.con->Ico.F64; } else if (e->Iex.Const.con->tag == Ico_F64i) { u.u64 = e->Iex.Const.con->Ico.F64i; } else vpanic("iselDblExpr(amd64): const"); addInstr(env, AMD64Instr_Imm64(u.u64, tmp)); addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp))); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, AMD64AMode_IR(0, hregAMD64_RSP()) )); add_to_rsp(env, 8); return res; } if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) { AMD64AMode* am; HReg res = newVRegV(env); vassert(e->Iex.Load.ty == Ity_F64); am = iselIntExpr_AMode(env, e->Iex.Load.addr); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am )); return res; } if (e->tag == Iex_Get) { AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset, hregAMD64_RBP() ); HReg res = newVRegV(env); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am )); return res; } if (e->tag == Iex_GetI) { AMD64AMode* am = genGuestArrayOffset( env, e->Iex.GetI.descr, e->Iex.GetI.ix, e->Iex.GetI.bias ); HReg res = newVRegV(env); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am )); return res; } if (e->tag == Iex_Triop) { IRTriop *triop = e->Iex.Triop.details; AMD64SseOp op = Asse_INVALID; switch (triop->op) { case Iop_AddF64: op = Asse_ADDF; break; case Iop_SubF64: op = Asse_SUBF; break; case Iop_MulF64: op = Asse_MULF; break; case Iop_DivF64: op = Asse_DIVF; break; default: break; } if (op != Asse_INVALID) { HReg dst = newVRegV(env); HReg argL = iselDblExpr(env, triop->arg2); HReg argR = iselDblExpr(env, triop->arg3); addInstr(env, mk_vMOVsd_RR(argL, dst)); /* XXXROUNDINGFIXME */ /* set roundingmode here */ addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst)); return dst; } } if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_MAddF64) { IRQop *qop = e->Iex.Qop.details; HReg dst = newVRegV(env); HReg argX = iselDblExpr(env, qop->arg2); HReg argY = iselDblExpr(env, qop->arg3); HReg argZ = iselDblExpr(env, qop->arg4); /* XXXROUNDINGFIXME */ /* set roundingmode here */ /* subq $32, %rsp -- make a space*/ sub_from_rsp(env, 32); /* Prepare 4 arg regs: leaq 0(%rsp), %rdi leaq 8(%rsp), %rsi leaq 16(%rsp), %rdx leaq 24(%rsp), %rcx */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, hregAMD64_RSP()), hregAMD64_RDI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(8, hregAMD64_RSP()), hregAMD64_RSI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, hregAMD64_RSP()), hregAMD64_RDX())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(24, hregAMD64_RSP()), hregAMD64_RCX())); /* Store the three args, at (%rsi), (%rdx) and (%rcx): movsd %argX, 0(%rsi) movsd %argY, 0(%rdx) movsd %argZ, 0(%rcx) */ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argX, AMD64AMode_IR(0, hregAMD64_RSI()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argY, AMD64AMode_IR(0, hregAMD64_RDX()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 8, argZ, AMD64AMode_IR(0, hregAMD64_RCX()))); /* call the helper */ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)(HWord)h_generic_calc_MAddF64, 4, mk_RetLoc_simple(RLPri_None) )); /* fetch the result from memory, using %r_argp, which the register allocator will keep alive across the call. */ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 8, dst, AMD64AMode_IR(0, hregAMD64_RSP()))); /* and finally, clear the space */ add_to_rsp(env, 32); return dst; } if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) { AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg arg = iselDblExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); /* rf now holds the value to be rounded. The first thing to do is set the FPU's rounding mode accordingly. */ /* Set host x87 rounding mode */ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 ); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp)); addInstr(env, AMD64Instr_A87Free(1)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8)); addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp)); /* Restore default x87 rounding. */ set_FPU_rounding_default( env ); return dst; } IRTriop *triop = e->Iex.Triop.details; if (e->tag == Iex_Triop && (triop->op == Iop_ScaleF64 || triop->op == Iop_AtanF64 || triop->op == Iop_Yl2xF64 || triop->op == Iop_Yl2xp1F64 || triop->op == Iop_PRemF64 || triop->op == Iop_PRem1F64) ) { AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg arg1 = iselDblExpr(env, triop->arg2); HReg arg2 = iselDblExpr(env, triop->arg3); HReg dst = newVRegV(env); Bool arg2first = toBool(triop->op == Iop_ScaleF64 || triop->op == Iop_PRemF64 || triop->op == Iop_PRem1F64); addInstr(env, AMD64Instr_A87Free(2)); /* one arg -> top of x87 stack */ addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8)); /* other arg -> top of x87 stack */ addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8)); /* do it */ /* XXXROUNDINGFIXME */ /* set roundingmode here */ switch (triop->op) { case Iop_ScaleF64: addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE)); break; case Iop_AtanF64: addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN)); break; case Iop_Yl2xF64: addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X)); break; case Iop_Yl2xp1F64: addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1)); break; case Iop_PRemF64: addInstr(env, AMD64Instr_A87FpOp(Afp_PREM)); break; case Iop_PRem1F64: addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1)); break; default: vassert(0); } /* save result */ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp)); return dst; } if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) { HReg dst = newVRegV(env); HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2); set_SSE_rounding_mode( env, e->Iex.Binop.arg1 ); addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst )); set_SSE_rounding_default( env ); return dst; } if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) { HReg dst = newVRegV(env); HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); set_SSE_rounding_default( env ); addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst )); return dst; } if (e->tag == Iex_Unop && (e->Iex.Unop.op == Iop_NegF64 || e->Iex.Unop.op == Iop_AbsF64)) { /* Sigh ... very rough code. Could do much better. */ /* Get the 128-bit literal 00---0 10---0 into a register and xor/nand it with the value to be negated. */ HReg r1 = newVRegI(env); HReg dst = newVRegV(env); HReg tmp = newVRegV(env); HReg src = iselDblExpr(env, e->Iex.Unop.arg); AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); addInstr(env, mk_vMOVsd_RR(src,tmp)); addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0))); addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 )); addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1))); addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0)); if (e->Iex.Unop.op == Iop_NegF64) addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst)); else addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst)); add_to_rsp(env, 16); return dst; } if (e->tag == Iex_Binop) { A87FpOp fpop = Afp_INVALID; switch (e->Iex.Binop.op) { case Iop_SqrtF64: fpop = Afp_SQRT; break; case Iop_SinF64: fpop = Afp_SIN; break; case Iop_CosF64: fpop = Afp_COS; break; case Iop_TanF64: fpop = Afp_TAN; break; case Iop_2xm1F64: fpop = Afp_2XM1; break; default: break; } if (fpop != Afp_INVALID) { AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg arg = iselDblExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1; addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp)); addInstr(env, AMD64Instr_A87Free(nNeeded)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8)); /* XXXROUNDINGFIXME */ /* set roundingmode here */ /* Note that AMD64Instr_A87FpOp(Afp_TAN) sets the condition codes. I don't think that matters, since this insn selector never generates such an instruction intervening between an flag-setting instruction and a flag-using instruction. */ addInstr(env, AMD64Instr_A87FpOp(fpop)); addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp)); return dst; } } if (e->tag == Iex_Unop) { switch (e->Iex.Unop.op) { //.. case Iop_I32toF64: { //.. HReg dst = newVRegF(env); //.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg); //.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri))); //.. set_FPU_rounding_default(env); //.. addInstr(env, X86Instr_FpLdStI( //.. True/*load*/, 4, dst, //.. X86AMode_IR(0, hregX86_ESP()))); //.. add_to_esp(env, 4); //.. return dst; //.. } case Iop_ReinterpI64asF64: { /* Given an I64, produce an IEEE754 double with the same bit pattern. */ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP()); HReg dst = newVRegV(env); AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg); /* paranoia */ set_SSE_rounding_default(env); addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp)); return dst; } case Iop_F32toF64: { HReg f32; HReg f64 = newVRegV(env); /* this shouldn't be necessary, but be paranoid ... */ set_SSE_rounding_default(env); f32 = iselFltExpr(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64)); return f64; } default: break; } } /* --------- MULTIPLEX --------- */ if (e->tag == Iex_ITE) { // VFD HReg r1, r0, dst; vassert(ty == Ity_F64); vassert(typeOfIRExpr(env->type_env,e->Iex.ITE.cond) == Ity_I1); r1 = iselDblExpr(env, e->Iex.ITE.iftrue); r0 = iselDblExpr(env, e->Iex.ITE.iffalse); dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(r1,dst)); AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond); addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst)); return dst; } ppIRExpr(e); vpanic("iselDblExpr_wrk"); } /*---------------------------------------------------------*/ /*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/ /*---------------------------------------------------------*/ static HReg iselVecExpr ( ISelEnv* env, const IRExpr* e ) { HReg r = iselVecExpr_wrk( env, e ); # if 0 vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); # endif vassert(hregClass(r) == HRcVec128); vassert(hregIsVirtual(r)); return r; } /* DO NOT CALL THIS DIRECTLY */ static HReg iselVecExpr_wrk ( ISelEnv* env, const IRExpr* e ) { HWord fn = 0; /* address of helper fn, if required */ Bool arg1isEReg = False; AMD64SseOp op = Asse_INVALID; vassert(e); IRType ty = typeOfIRExpr(env->type_env, e); vassert(ty == Ity_V128); UInt laneBits = 0; if (e->tag == Iex_RdTmp) { return lookupIRTemp(env, e->Iex.RdTmp.tmp); } if (e->tag == Iex_Get) { HReg dst = newVRegV(env); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP()) ) ); return dst; } if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) { HReg dst = newVRegV(env); AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am )); return dst; } if (e->tag == Iex_Const) { HReg dst = newVRegV(env); vassert(e->Iex.Const.con->tag == Ico_V128); switch (e->Iex.Const.con->Ico.V128) { case 0x0000: dst = generate_zeroes_V128(env); break; case 0xFFFF: dst = generate_ones_V128(env); break; default: { AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); /* do push_uimm64 twice, first time for the high-order half. */ push_uimm64(env, bitmask8_to_bytemask64( (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF )); push_uimm64(env, bitmask8_to_bytemask64( (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF )); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 )); add_to_rsp(env, 16); break; } } return dst; } if (e->tag == Iex_Unop) { switch (e->Iex.Unop.op) { case Iop_NotV128: { HReg arg = iselVecExpr(env, e->Iex.Unop.arg); return do_sse_NotV128(env, arg); } case Iop_CmpNEZ64x2: { /* We can use SSE2 instructions for this. */ /* Ideally, we want to do a 64Ix2 comparison against zero of the operand. Problem is no such insn exists. Solution therefore is to do a 32Ix4 comparison instead, and bitwise- negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and let the not'd result of this initial comparison be a:b:c:d. What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use pshufd to create a value b:a:d:c, and OR that with a:b:c:d, giving the required result. The required selection sequence is 2,3,0,1, which according to Intel's documentation means the pshufd literal value is 0xB1, that is, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0) */ HReg arg = iselVecExpr(env, e->Iex.Unop.arg); HReg tmp = generate_zeroes_V128(env); HReg dst = newVRegV(env); addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp)); tmp = do_sse_NotV128(env, tmp); addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst)); addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); return dst; } case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector; case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector; case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector; do_CmpNEZ_vector: { HReg arg = iselVecExpr(env, e->Iex.Unop.arg); HReg tmp = newVRegV(env); HReg zero = generate_zeroes_V128(env); HReg dst; addInstr(env, mk_vMOVsd_RR(arg, tmp)); addInstr(env, AMD64Instr_SseReRg(op, zero, tmp)); dst = do_sse_NotV128(env, tmp); return dst; } case Iop_RecipEst32Fx4: op = Asse_RCPF; goto do_32Fx4_unary; case Iop_RSqrtEst32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary; do_32Fx4_unary: { HReg arg = iselVecExpr(env, e->Iex.Unop.arg); HReg dst = newVRegV(env); addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst)); return dst; } case Iop_RecipEst32F0x4: op = Asse_RCPF; goto do_32F0x4_unary; case Iop_RSqrtEst32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary; case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary; do_32F0x4_unary: { /* A bit subtle. We have to copy the arg to the result register first, because actually doing the SSE scalar insn leaves the upper 3/4 of the destination register unchanged. Whereas the required semantics of these primops is that the upper 3/4 is simply copied in from the argument. */ HReg arg = iselVecExpr(env, e->Iex.Unop.arg); HReg dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(arg, dst)); addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst)); return dst; } case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary; do_64F0x2_unary: { /* A bit subtle. We have to copy the arg to the result register first, because actually doing the SSE scalar insn leaves the upper half of the destination register unchanged. Whereas the required semantics of these primops is that the upper half is simply copied in from the argument. */ HReg arg = iselVecExpr(env, e->Iex.Unop.arg); HReg dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(arg, dst)); addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst)); return dst; } case Iop_32UtoV128: { // FIXME maybe just use MOVQ here? HReg dst = newVRegV(env); AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP()); AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32)); addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32)); return dst; } case Iop_64UtoV128: { // FIXME maybe just use MOVQ here? HReg dst = newVRegV(env); AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg); addInstr(env, AMD64Instr_Push(rmi)); addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0)); add_to_rsp(env, 8); return dst; } case Iop_V256toV128_0: case Iop_V256toV128_1: { HReg vHi, vLo; iselDVecExpr(&vHi, &vLo, env, e->Iex.Unop.arg); return (e->Iex.Unop.op == Iop_V256toV128_1) ? vHi : vLo; } case Iop_F16toF32x4: { if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) { HReg src = iselIntExpr_R(env, e->Iex.Unop.arg); HReg dst = newVRegV(env); addInstr(env, AMD64Instr_SseMOVQ(src, dst, /*toXMM=*/True)); addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, dst, dst)); return dst; } break; } default: break; } /* switch (e->Iex.Unop.op) */ } /* if (e->tag == Iex_Unop) */ if (e->tag == Iex_Binop) { switch (e->Iex.Binop.op) { case Iop_Sqrt64Fx2: case Iop_Sqrt32Fx4: { /* :: (rmode, vec) -> vec */ HReg arg = iselVecExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); /* XXXROUNDINGFIXME */ /* set roundingmode here */ addInstr(env, (e->Iex.Binop.op == Iop_Sqrt64Fx2 ? AMD64Instr_Sse64Fx2 : AMD64Instr_Sse32Fx4) (Asse_SQRTF, arg, dst)); return dst; } /* FIXME: could we generate MOVQ here? */ case Iop_SetV128lo64: { HReg dst = newVRegV(env); HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1); HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2); AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP()); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16)); addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16)); return dst; } /* FIXME: could we generate MOVD here? */ case Iop_SetV128lo32: { HReg dst = newVRegV(env); HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1); HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2); AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP()); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16)); addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16)); return dst; } case Iop_64HLtoV128: { const IRExpr* arg1 = e->Iex.Binop.arg1; const IRExpr* arg2 = e->Iex.Binop.arg2; HReg dst = newVRegV(env); HReg tmp = newVRegV(env); HReg qHi = iselIntExpr_R(env, arg1); // If the args are trivially the same (tmp or const), use the same // source register for both, and only one movq since those are // (relatively) expensive. if (areAtomsAndEqual(arg1, arg2)) { addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/)); addInstr(env, mk_vMOVsd_RR(dst, tmp)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); } else { HReg qLo = iselIntExpr_R(env, arg2); addInstr(env, AMD64Instr_SseMOVQ(qHi, dst, True/*toXMM*/)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); addInstr(env, AMD64Instr_SseMOVQ(qLo, tmp, True/*toXMM*/)); addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); } return dst; } case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4; case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4; case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4; case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4; case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4; case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4; do_32Fx4: { HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); HReg argR = iselVecExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argL, dst)); addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst)); return dst; } case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2; case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2; case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2; case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2; case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2; case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2; do_64Fx2: { HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); HReg argR = iselVecExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argL, dst)); addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst)); return dst; } case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4; case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4; case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4; case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4; case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4; case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4; case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4; case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4; case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4; case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4; do_32F0x4: { HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); HReg argR = iselVecExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argL, dst)); addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst)); return dst; } case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2; case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2; case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2; case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2; case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2; case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2; case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2; case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2; case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2; case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2; do_64F0x2: { HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); HReg argR = iselVecExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argL, dst)); addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst)); return dst; } case Iop_PermOrZero8x16: if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) { op = Asse_PSHUFB; goto do_SseReRg; } // Otherwise we'll have to generate a call to // h_generic_calc_PermOrZero8x16 (ATK). But that would only be for a // host which doesn't have SSSE3, in which case we don't expect this // IROp to enter the compilation pipeline in the first place. break; case Iop_PwExtUSMulQAdd8x16: if (env->hwcaps & VEX_HWCAPS_AMD64_SSSE3) { op = Asse_PMADDUBSW; goto do_SseReRg; } break; case Iop_QNarrowBin32Sto16Sx8: op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg; case Iop_QNarrowBin16Sto8Sx16: op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg; case Iop_QNarrowBin16Sto8Ux16: op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg; case Iop_InterleaveHI8x16: op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg; case Iop_InterleaveHI16x8: op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg; case Iop_InterleaveHI32x4: op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg; case Iop_InterleaveHI64x2: op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg; case Iop_InterleaveLO8x16: op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg; case Iop_InterleaveLO16x8: op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg; case Iop_InterleaveLO32x4: op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg; case Iop_InterleaveLO64x2: op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg; case Iop_AndV128: op = Asse_AND; goto do_SseReRg; case Iop_OrV128: op = Asse_OR; goto do_SseReRg; case Iop_XorV128: op = Asse_XOR; goto do_SseReRg; case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg; case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg; case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg; case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg; case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg; case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg; case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg; case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg; case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg; case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg; case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg; case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg; case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg; case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg; case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg; case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg; case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg; case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg; case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg; case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg; case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg; case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg; case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg; case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg; case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg; case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg; case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg; case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg; case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg; case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg; case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg; do_SseReRg: { HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1); HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); if (arg1isEReg) { addInstr(env, mk_vMOVsd_RR(arg2, dst)); addInstr(env, AMD64Instr_SseReRg(op, arg1, dst)); } else { addInstr(env, mk_vMOVsd_RR(arg1, dst)); addInstr(env, AMD64Instr_SseReRg(op, arg2, dst)); } return dst; } case Iop_ShlN16x8: laneBits = 16; op = Asse_SHL16; goto do_SseShift; case Iop_ShlN32x4: laneBits = 32; op = Asse_SHL32; goto do_SseShift; case Iop_ShlN64x2: laneBits = 64; op = Asse_SHL64; goto do_SseShift; case Iop_SarN16x8: laneBits = 16; op = Asse_SAR16; goto do_SseShift; case Iop_SarN32x4: laneBits = 32; op = Asse_SAR32; goto do_SseShift; case Iop_ShrN16x8: laneBits = 16; op = Asse_SHR16; goto do_SseShift; case Iop_ShrN32x4: laneBits = 32; op = Asse_SHR32; goto do_SseShift; case Iop_ShrN64x2: laneBits = 64; op = Asse_SHR64; goto do_SseShift; do_SseShift: { HReg dst = newVRegV(env); HReg greg = iselVecExpr(env, e->Iex.Binop.arg1); /* If it's a shift by an in-range immediate, generate a single instruction. */ if (e->Iex.Binop.arg2->tag == Iex_Const) { IRConst* c = e->Iex.Binop.arg2->Iex.Const.con; vassert(c->tag == Ico_U8); UInt shift = c->Ico.U8; if (shift < laneBits) { addInstr(env, mk_vMOVsd_RR(greg, dst)); addInstr(env, AMD64Instr_SseShiftN(op, shift, dst)); return dst; } } /* Otherwise we have to do it the longwinded way. */ AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2); AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); HReg ereg = newVRegV(env); addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0))); addInstr(env, AMD64Instr_Push(rmi)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0)); addInstr(env, mk_vMOVsd_RR(greg, dst)); addInstr(env, AMD64Instr_SseReRg(op, ereg, dst)); add_to_rsp(env, 16); return dst; } case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4; goto do_SseAssistedBinary; case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4; goto do_SseAssistedBinary; case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4; goto do_SseAssistedBinary; case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4; goto do_SseAssistedBinary; case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4; goto do_SseAssistedBinary; case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8; goto do_SseAssistedBinary; case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8; goto do_SseAssistedBinary; case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16; goto do_SseAssistedBinary; case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16; goto do_SseAssistedBinary; case Iop_CmpEQ64x2: fn = (HWord)h_generic_calc_CmpEQ64x2; goto do_SseAssistedBinary; case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2; goto do_SseAssistedBinary; case Iop_Perm32x4: fn = (HWord)h_generic_calc_Perm32x4; goto do_SseAssistedBinary; case Iop_QNarrowBin32Sto16Ux8: fn = (HWord)h_generic_calc_QNarrowBin32Sto16Ux8; goto do_SseAssistedBinary; case Iop_NarrowBin16to8x16: fn = (HWord)h_generic_calc_NarrowBin16to8x16; goto do_SseAssistedBinary; case Iop_NarrowBin32to16x8: fn = (HWord)h_generic_calc_NarrowBin32to16x8; goto do_SseAssistedBinary; do_SseAssistedBinary: { /* RRRufff! RRRufff code is what we're generating here. Oh well. */ vassert(fn != 0); HReg dst = newVRegV(env); HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); HReg argR = iselVecExpr(env, e->Iex.Binop.arg2); HReg argp = newVRegI(env); /* subq $112, %rsp -- make a space*/ sub_from_rsp(env, 112); /* leaq 48(%rsp), %r_argp -- point into it */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()), argp)); /* andq $-16, %r_argp -- 16-align the pointer */ addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm( ~(UInt)15 ), argp)); /* Prepare 3 arg regs: leaq 0(%r_argp), %rdi leaq 16(%r_argp), %rsi leaq 32(%r_argp), %rdx */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp), hregAMD64_RDI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp), hregAMD64_RSI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp), hregAMD64_RDX())); /* Store the two args, at (%rsi) and (%rdx): movupd %argL, 0(%rsi) movupd %argR, 0(%rdx) */ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL, AMD64AMode_IR(0, hregAMD64_RSI()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR, AMD64AMode_IR(0, hregAMD64_RDX()))); /* call the helper */ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, mk_RetLoc_simple(RLPri_None) )); /* fetch the result from memory, using %r_argp, which the register allocator will keep alive across the call. */ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst, AMD64AMode_IR(0, argp))); /* and finally, clear the space */ add_to_rsp(env, 112); return dst; } case Iop_SarN64x2: fn = (HWord)h_generic_calc_SarN64x2; goto do_SseAssistedVectorAndScalar; case Iop_SarN8x16: fn = (HWord)h_generic_calc_SarN8x16; goto do_SseAssistedVectorAndScalar; do_SseAssistedVectorAndScalar: { /* RRRufff! RRRufff code is what we're generating here. Oh well. */ vassert(fn != 0); HReg dst = newVRegV(env); HReg argL = iselVecExpr(env, e->Iex.Binop.arg1); HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2); HReg argp = newVRegI(env); /* subq $112, %rsp -- make a space*/ sub_from_rsp(env, 112); /* leaq 48(%rsp), %r_argp -- point into it */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()), argp)); /* andq $-16, %r_argp -- 16-align the pointer */ addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm( ~(UInt)15 ), argp)); /* Prepare 2 vector arg regs: leaq 0(%r_argp), %rdi leaq 16(%r_argp), %rsi */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp), hregAMD64_RDI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp), hregAMD64_RSI())); /* Store the vector arg, at (%rsi): movupd %argL, 0(%rsi) */ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL, AMD64AMode_IR(0, hregAMD64_RSI()))); /* And get the scalar value into rdx */ addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RDX())); /* call the helper */ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, mk_RetLoc_simple(RLPri_None) )); /* fetch the result from memory, using %r_argp, which the register allocator will keep alive across the call. */ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst, AMD64AMode_IR(0, argp))); /* and finally, clear the space */ add_to_rsp(env, 112); return dst; } case Iop_I32StoF32x4: case Iop_F32toI32Sx4: { HReg arg = iselVecExpr(env, e->Iex.Binop.arg2); HReg dst = newVRegV(env); AMD64SseOp mop = e->Iex.Binop.op == Iop_I32StoF32x4 ? Asse_I2F : Asse_F2I; set_SSE_rounding_mode(env, e->Iex.Binop.arg1); addInstr(env, AMD64Instr_Sse32Fx4(mop, arg, dst)); set_SSE_rounding_default(env); return dst; } // Half-float vector conversion case Iop_F32toF16x8: { if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) { HReg srcHi, srcLo; iselDVecExpr(&srcHi, &srcLo, env, e->Iex.Binop.arg2); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); set_SSE_rounding_mode( env, e->Iex.Binop.arg1 ); addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcHi, dstHi)); addInstr(env, AMD64Instr_Sse32Fx4(Asse_F32toF16, srcLo, dstLo)); set_SSE_rounding_default(env); // Now we have the result in dstHi[63:0] and dstLo[63:0], but we // need to compact all that into one register. There's probably a // more elegant way to do this, but .. addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi)); // dstHi is now 127:64 = useful data, 63:0 = zero addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, dstLo)); // dstLo is now 127:64 = zero, 63:0 = useful data addInstr(env, AMD64Instr_SseReRg(Asse_OR, dstHi, dstLo)); return dstLo; } break; } default: break; } /* switch (e->Iex.Binop.op) */ } /* if (e->tag == Iex_Binop) */ if (e->tag == Iex_Triop) { IRTriop *triop = e->Iex.Triop.details; switch (triop->op) { case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2_w_rm; case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2_w_rm; case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2_w_rm; case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2_w_rm; do_64Fx2_w_rm: { HReg argL = iselVecExpr(env, triop->arg2); HReg argR = iselVecExpr(env, triop->arg3); HReg dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argL, dst)); /* XXXROUNDINGFIXME */ /* set roundingmode here */ addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst)); return dst; } case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4_w_rm; case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4_w_rm; case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4_w_rm; case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4_w_rm; do_32Fx4_w_rm: { HReg argL = iselVecExpr(env, triop->arg2); HReg argR = iselVecExpr(env, triop->arg3); HReg dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argL, dst)); /* XXXROUNDINGFIXME */ /* set roundingmode here */ addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst)); return dst; } default: break; } /* switch (triop->op) */ } /* if (e->tag == Iex_Triop) */ if (e->tag == Iex_ITE) { // VFD HReg r1 = iselVecExpr(env, e->Iex.ITE.iftrue); HReg r0 = iselVecExpr(env, e->Iex.ITE.iffalse); HReg dst = newVRegV(env); addInstr(env, mk_vMOVsd_RR(r1,dst)); AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond); addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0, dst)); return dst; } //vec_fail: vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n", LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps)); ppIRExpr(e); vpanic("iselVecExpr_wrk"); } /*---------------------------------------------------------*/ /*--- ISEL: SIMD (V256) expressions, into 2 XMM regs. --*/ /*---------------------------------------------------------*/ static void iselDVecExpr ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, ISelEnv* env, const IRExpr* e ) { iselDVecExpr_wrk( rHi, rLo, env, e ); # if 0 vex_printf("\n"); ppIRExpr(e); vex_printf("\n"); # endif vassert(hregClass(*rHi) == HRcVec128); vassert(hregClass(*rLo) == HRcVec128); vassert(hregIsVirtual(*rHi)); vassert(hregIsVirtual(*rLo)); } /* DO NOT CALL THIS DIRECTLY */ static void iselDVecExpr_wrk ( /*OUT*/HReg* rHi, /*OUT*/HReg* rLo, ISelEnv* env, const IRExpr* e ) { HWord fn = 0; /* address of helper fn, if required */ vassert(e); IRType ty = typeOfIRExpr(env->type_env, e); vassert(ty == Ity_V256); UInt laneBits = 0; AMD64SseOp op = Asse_INVALID; /* read 256-bit IRTemp */ if (e->tag == Iex_RdTmp) { lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp); return; } if (e->tag == Iex_Get) { HReg vHi = newVRegV(env); HReg vLo = newVRegV(env); HReg rbp = hregAMD64_RBP(); AMD64AMode* am0 = AMD64AMode_IR(e->Iex.Get.offset + 0, rbp); AMD64AMode* am16 = AMD64AMode_IR(e->Iex.Get.offset + 16, rbp); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16)); *rHi = vHi; *rLo = vLo; return; } if (e->tag == Iex_Load) { HReg vHi = newVRegV(env); HReg vLo = newVRegV(env); HReg rA = iselIntExpr_R(env, e->Iex.Load.addr); AMD64AMode* am0 = AMD64AMode_IR(0, rA); AMD64AMode* am16 = AMD64AMode_IR(16, rA); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vLo, am0)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, vHi, am16)); *rHi = vHi; *rLo = vLo; return; } if (e->tag == Iex_Const) { vassert(e->Iex.Const.con->tag == Ico_V256); switch (e->Iex.Const.con->Ico.V256) { case 0x00000000: { HReg vHi = generate_zeroes_V128(env); HReg vLo = newVRegV(env); addInstr(env, mk_vMOVsd_RR(vHi, vLo)); *rHi = vHi; *rLo = vLo; return; } case 0xFFFFFFFF: { HReg vHi = generate_ones_V128(env); HReg vLo = newVRegV(env); addInstr(env, mk_vMOVsd_RR(vHi, vLo)); *rHi = vHi; *rLo = vLo; return; } default: break; /* give up. Until such time as is necessary. */ } } if (e->tag == Iex_Unop) { switch (e->Iex.Unop.op) { case Iop_NotV256: { HReg argHi, argLo; iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg); *rHi = do_sse_NotV128(env, argHi); *rLo = do_sse_NotV128(env, argLo); return; } case Iop_RecipEst32Fx8: op = Asse_RCPF; goto do_32Fx8_unary; case Iop_Sqrt32Fx8: op = Asse_SQRTF; goto do_32Fx8_unary; case Iop_RSqrtEst32Fx8: op = Asse_RSQRTF; goto do_32Fx8_unary; do_32Fx8_unary: { HReg argHi, argLo; iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, AMD64Instr_Sse32Fx4(op, argHi, dstHi)); addInstr(env, AMD64Instr_Sse32Fx4(op, argLo, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } case Iop_Sqrt64Fx4: op = Asse_SQRTF; goto do_64Fx4_unary; do_64Fx4_unary: { HReg argHi, argLo; iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, AMD64Instr_Sse64Fx2(op, argHi, dstHi)); addInstr(env, AMD64Instr_Sse64Fx2(op, argLo, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } case Iop_CmpNEZ64x4: { /* We can use SSE2 instructions for this. */ /* Same scheme as Iop_CmpNEZ64x2, except twice as wide (obviously). See comment on Iop_CmpNEZ64x2 for explanation of what's going on here. */ HReg argHi, argLo; iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg); HReg tmpHi = generate_zeroes_V128(env); HReg tmpLo = newVRegV(env); addInstr(env, mk_vMOVsd_RR(tmpHi, tmpLo)); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argHi, tmpHi)); addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, argLo, tmpLo)); tmpHi = do_sse_NotV128(env, tmpHi); tmpLo = do_sse_NotV128(env, tmpLo); addInstr(env, AMD64Instr_SseShuf(0xB1, tmpHi, dstHi)); addInstr(env, AMD64Instr_SseShuf(0xB1, tmpLo, dstLo)); addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpHi, dstHi)); addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmpLo, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } case Iop_CmpNEZ32x8: op = Asse_CMPEQ32; goto do_CmpNEZ_vector; case Iop_CmpNEZ16x16: op = Asse_CMPEQ16; goto do_CmpNEZ_vector; case Iop_CmpNEZ8x32: op = Asse_CMPEQ8; goto do_CmpNEZ_vector; do_CmpNEZ_vector: { HReg argHi, argLo; iselDVecExpr(&argHi, &argLo, env, e->Iex.Unop.arg); HReg tmpHi = newVRegV(env); HReg tmpLo = newVRegV(env); HReg zero = generate_zeroes_V128(env); HReg dstHi, dstLo; addInstr(env, mk_vMOVsd_RR(argHi, tmpHi)); addInstr(env, mk_vMOVsd_RR(argLo, tmpLo)); addInstr(env, AMD64Instr_SseReRg(op, zero, tmpHi)); addInstr(env, AMD64Instr_SseReRg(op, zero, tmpLo)); dstHi = do_sse_NotV128(env, tmpHi); dstLo = do_sse_NotV128(env, tmpLo); *rHi = dstHi; *rLo = dstLo; return; } case Iop_F16toF32x8: { if (env->hwcaps & VEX_HWCAPS_AMD64_F16C) { HReg src = iselVecExpr(env, e->Iex.Unop.arg); HReg srcCopy = newVRegV(env); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); // Copy src, since we'll need to modify it. addInstr(env, mk_vMOVsd_RR(src, srcCopy)); addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstLo)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHR128, 64, srcCopy)); addInstr(env, AMD64Instr_Sse32Fx4(Asse_F16toF32, srcCopy, dstHi)); *rHi = dstHi; *rLo = dstLo; return; } break; } default: break; } /* switch (e->Iex.Unop.op) */ } /* if (e->tag == Iex_Unop) */ if (e->tag == Iex_Binop) { switch (e->Iex.Binop.op) { case Iop_Max64Fx4: op = Asse_MAXF; goto do_64Fx4; case Iop_Min64Fx4: op = Asse_MINF; goto do_64Fx4; do_64Fx4: { HReg argLhi, argLlo, argRhi, argRlo; iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1); iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argLhi, dstHi)); addInstr(env, mk_vMOVsd_RR(argLlo, dstLo)); addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi)); addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } case Iop_Max32Fx8: op = Asse_MAXF; goto do_32Fx8; case Iop_Min32Fx8: op = Asse_MINF; goto do_32Fx8; do_32Fx8: { HReg argLhi, argLlo, argRhi, argRlo; iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1); iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argLhi, dstHi)); addInstr(env, mk_vMOVsd_RR(argLlo, dstLo)); addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi)); addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } case Iop_AndV256: op = Asse_AND; goto do_SseReRg; case Iop_OrV256: op = Asse_OR; goto do_SseReRg; case Iop_XorV256: op = Asse_XOR; goto do_SseReRg; case Iop_Add8x32: op = Asse_ADD8; goto do_SseReRg; case Iop_Add16x16: op = Asse_ADD16; goto do_SseReRg; case Iop_Add32x8: op = Asse_ADD32; goto do_SseReRg; case Iop_Add64x4: op = Asse_ADD64; goto do_SseReRg; case Iop_QAdd8Sx32: op = Asse_QADD8S; goto do_SseReRg; case Iop_QAdd16Sx16: op = Asse_QADD16S; goto do_SseReRg; case Iop_QAdd8Ux32: op = Asse_QADD8U; goto do_SseReRg; case Iop_QAdd16Ux16: op = Asse_QADD16U; goto do_SseReRg; case Iop_Avg8Ux32: op = Asse_AVG8U; goto do_SseReRg; case Iop_Avg16Ux16: op = Asse_AVG16U; goto do_SseReRg; case Iop_CmpEQ8x32: op = Asse_CMPEQ8; goto do_SseReRg; case Iop_CmpEQ16x16: op = Asse_CMPEQ16; goto do_SseReRg; case Iop_CmpEQ32x8: op = Asse_CMPEQ32; goto do_SseReRg; case Iop_CmpGT8Sx32: op = Asse_CMPGT8S; goto do_SseReRg; case Iop_CmpGT16Sx16: op = Asse_CMPGT16S; goto do_SseReRg; case Iop_CmpGT32Sx8: op = Asse_CMPGT32S; goto do_SseReRg; case Iop_Max16Sx16: op = Asse_MAX16S; goto do_SseReRg; case Iop_Max8Ux32: op = Asse_MAX8U; goto do_SseReRg; case Iop_Min16Sx16: op = Asse_MIN16S; goto do_SseReRg; case Iop_Min8Ux32: op = Asse_MIN8U; goto do_SseReRg; case Iop_MulHi16Ux16: op = Asse_MULHI16U; goto do_SseReRg; case Iop_MulHi16Sx16: op = Asse_MULHI16S; goto do_SseReRg; case Iop_Mul16x16: op = Asse_MUL16; goto do_SseReRg; case Iop_Sub8x32: op = Asse_SUB8; goto do_SseReRg; case Iop_Sub16x16: op = Asse_SUB16; goto do_SseReRg; case Iop_Sub32x8: op = Asse_SUB32; goto do_SseReRg; case Iop_Sub64x4: op = Asse_SUB64; goto do_SseReRg; case Iop_QSub8Sx32: op = Asse_QSUB8S; goto do_SseReRg; case Iop_QSub16Sx16: op = Asse_QSUB16S; goto do_SseReRg; case Iop_QSub8Ux32: op = Asse_QSUB8U; goto do_SseReRg; case Iop_QSub16Ux16: op = Asse_QSUB16U; goto do_SseReRg; do_SseReRg: { HReg argLhi, argLlo, argRhi, argRlo; iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1); iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argLhi, dstHi)); addInstr(env, mk_vMOVsd_RR(argLlo, dstLo)); addInstr(env, AMD64Instr_SseReRg(op, argRhi, dstHi)); addInstr(env, AMD64Instr_SseReRg(op, argRlo, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } case Iop_ShlN16x16: laneBits = 16; op = Asse_SHL16; goto do_SseShift; case Iop_ShlN32x8: laneBits = 32; op = Asse_SHL32; goto do_SseShift; case Iop_ShlN64x4: laneBits = 64; op = Asse_SHL64; goto do_SseShift; case Iop_SarN16x16: laneBits = 16; op = Asse_SAR16; goto do_SseShift; case Iop_SarN32x8: laneBits = 32; op = Asse_SAR32; goto do_SseShift; case Iop_ShrN16x16: laneBits = 16; op = Asse_SHR16; goto do_SseShift; case Iop_ShrN32x8: laneBits = 32; op = Asse_SHR32; goto do_SseShift; case Iop_ShrN64x4: laneBits = 64; op = Asse_SHR64; goto do_SseShift; do_SseShift: { HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); HReg gregHi, gregLo; iselDVecExpr(&gregHi, &gregLo, env, e->Iex.Binop.arg1); /* If it's a shift by an in-range immediate, generate two single instructions. */ if (e->Iex.Binop.arg2->tag == Iex_Const) { IRConst* c = e->Iex.Binop.arg2->Iex.Const.con; vassert(c->tag == Ico_U8); UInt shift = c->Ico.U8; if (shift < laneBits) { addInstr(env, mk_vMOVsd_RR(gregHi, dstHi)); addInstr(env, AMD64Instr_SseShiftN(op, shift, dstHi)); addInstr(env, mk_vMOVsd_RR(gregLo, dstLo)); addInstr(env, AMD64Instr_SseShiftN(op, shift, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } } /* Otherwise we have to do it the longwinded way. */ AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2); AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP()); HReg ereg = newVRegV(env); addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0))); addInstr(env, AMD64Instr_Push(rmi)); addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0)); addInstr(env, mk_vMOVsd_RR(gregHi, dstHi)); addInstr(env, AMD64Instr_SseReRg(op, ereg, dstHi)); addInstr(env, mk_vMOVsd_RR(gregLo, dstLo)); addInstr(env, AMD64Instr_SseReRg(op, ereg, dstLo)); add_to_rsp(env, 16); *rHi = dstHi; *rLo = dstLo; return; } case Iop_V128HLtoV256: { // Curiously, there doesn't seem to be any benefit to be had here by // checking whether arg1 and arg2 are the same, in the style of how // (eg) 64HLtoV128 is handled elsewhere in this file. *rHi = iselVecExpr(env, e->Iex.Binop.arg1); *rLo = iselVecExpr(env, e->Iex.Binop.arg2); return; } case Iop_Mul32x8: fn = (HWord)h_generic_calc_Mul32x4; goto do_SseAssistedBinary; case Iop_Max32Sx8: fn = (HWord)h_generic_calc_Max32Sx4; goto do_SseAssistedBinary; case Iop_Min32Sx8: fn = (HWord)h_generic_calc_Min32Sx4; goto do_SseAssistedBinary; case Iop_Max32Ux8: fn = (HWord)h_generic_calc_Max32Ux4; goto do_SseAssistedBinary; case Iop_Min32Ux8: fn = (HWord)h_generic_calc_Min32Ux4; goto do_SseAssistedBinary; case Iop_Max16Ux16: fn = (HWord)h_generic_calc_Max16Ux8; goto do_SseAssistedBinary; case Iop_Min16Ux16: fn = (HWord)h_generic_calc_Min16Ux8; goto do_SseAssistedBinary; case Iop_Max8Sx32: fn = (HWord)h_generic_calc_Max8Sx16; goto do_SseAssistedBinary; case Iop_Min8Sx32: fn = (HWord)h_generic_calc_Min8Sx16; goto do_SseAssistedBinary; case Iop_CmpEQ64x4: fn = (HWord)h_generic_calc_CmpEQ64x2; goto do_SseAssistedBinary; case Iop_CmpGT64Sx4: fn = (HWord)h_generic_calc_CmpGT64Sx2; goto do_SseAssistedBinary; do_SseAssistedBinary: { /* RRRufff! RRRufff code is what we're generating here. Oh well. */ vassert(fn != 0); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); HReg argLhi, argLlo, argRhi, argRlo; iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1); iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2); HReg argp = newVRegI(env); /* subq $160, %rsp -- make a space*/ sub_from_rsp(env, 160); /* leaq 48(%rsp), %r_argp -- point into it */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()), argp)); /* andq $-16, %r_argp -- 16-align the pointer */ addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm( ~(UInt)15 ), argp)); /* Prepare 3 arg regs: leaq 0(%r_argp), %rdi leaq 16(%r_argp), %rsi leaq 32(%r_argp), %rdx */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp), hregAMD64_RDI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp), hregAMD64_RSI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp), hregAMD64_RDX())); /* Store the two high args, at (%rsi) and (%rdx): movupd %argLhi, 0(%rsi) movupd %argRhi, 0(%rdx) */ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi, AMD64AMode_IR(0, hregAMD64_RSI()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi, AMD64AMode_IR(0, hregAMD64_RDX()))); /* Store the two low args, at 48(%rsi) and 48(%rdx): movupd %argLlo, 48(%rsi) movupd %argRlo, 48(%rdx) */ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo, AMD64AMode_IR(48, hregAMD64_RSI()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo, AMD64AMode_IR(48, hregAMD64_RDX()))); /* call the helper */ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, mk_RetLoc_simple(RLPri_None) )); /* Prepare 3 arg regs: leaq 48(%r_argp), %rdi leaq 64(%r_argp), %rsi leaq 80(%r_argp), %rdx */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, argp), hregAMD64_RDI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp), hregAMD64_RSI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(80, argp), hregAMD64_RDX())); /* call the helper */ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, mk_RetLoc_simple(RLPri_None) )); /* fetch the result from memory, using %r_argp, which the register allocator will keep alive across the call. */ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi, AMD64AMode_IR(0, argp))); addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo, AMD64AMode_IR(48, argp))); /* and finally, clear the space */ add_to_rsp(env, 160); *rHi = dstHi; *rLo = dstLo; return; } case Iop_Perm32x8: fn = (HWord)h_generic_calc_Perm32x8; goto do_SseAssistedBinary256; do_SseAssistedBinary256: { /* RRRufff! RRRufff code is what we're generating here. Oh well. */ vassert(fn != 0); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); HReg argLhi, argLlo, argRhi, argRlo; iselDVecExpr(&argLhi, &argLlo, env, e->Iex.Binop.arg1); iselDVecExpr(&argRhi, &argRlo, env, e->Iex.Binop.arg2); HReg argp = newVRegI(env); /* subq $160, %rsp -- make a space*/ sub_from_rsp(env, 160); /* leaq 48(%rsp), %r_argp -- point into it */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()), argp)); /* andq $-16, %r_argp -- 16-align the pointer */ addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm( ~(UInt)15 ), argp)); /* Prepare 3 arg regs: leaq 0(%r_argp), %rdi leaq 32(%r_argp), %rsi leaq 64(%r_argp), %rdx */ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp), hregAMD64_RDI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp), hregAMD64_RSI())); addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(64, argp), hregAMD64_RDX())); /* Store the two args, at (%rsi) and (%rdx): movupd %argLlo, 0(%rsi) movupd %argLhi, 16(%rsi) movupd %argRlo, 0(%rdx) movupd %argRhi, 16(%rdx) */ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLlo, AMD64AMode_IR(0, hregAMD64_RSI()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argLhi, AMD64AMode_IR(16, hregAMD64_RSI()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRlo, AMD64AMode_IR(0, hregAMD64_RDX()))); addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argRhi, AMD64AMode_IR(16, hregAMD64_RDX()))); /* call the helper */ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3, mk_RetLoc_simple(RLPri_None) )); /* fetch the result from memory, using %r_argp, which the register allocator will keep alive across the call. */ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstLo, AMD64AMode_IR(0, argp))); addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dstHi, AMD64AMode_IR(16, argp))); /* and finally, clear the space */ add_to_rsp(env, 160); *rHi = dstHi; *rLo = dstLo; return; } case Iop_I32StoF32x8: case Iop_F32toI32Sx8: { HReg argHi, argLo; iselDVecExpr(&argHi, &argLo, env, e->Iex.Binop.arg2); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); AMD64SseOp mop = e->Iex.Binop.op == Iop_I32StoF32x8 ? Asse_I2F : Asse_F2I; set_SSE_rounding_mode(env, e->Iex.Binop.arg1); addInstr(env, AMD64Instr_Sse32Fx4(mop, argHi, dstHi)); addInstr(env, AMD64Instr_Sse32Fx4(mop, argLo, dstLo)); set_SSE_rounding_default(env); *rHi = dstHi; *rLo = dstLo; return; } default: break; } /* switch (e->Iex.Binop.op) */ } /* if (e->tag == Iex_Binop) */ if (e->tag == Iex_Triop) { IRTriop *triop = e->Iex.Triop.details; switch (triop->op) { case Iop_Add64Fx4: op = Asse_ADDF; goto do_64Fx4_w_rm; case Iop_Sub64Fx4: op = Asse_SUBF; goto do_64Fx4_w_rm; case Iop_Mul64Fx4: op = Asse_MULF; goto do_64Fx4_w_rm; case Iop_Div64Fx4: op = Asse_DIVF; goto do_64Fx4_w_rm; do_64Fx4_w_rm: { HReg argLhi, argLlo, argRhi, argRlo; iselDVecExpr(&argLhi, &argLlo, env, triop->arg2); iselDVecExpr(&argRhi, &argRlo, env, triop->arg3); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argLhi, dstHi)); addInstr(env, mk_vMOVsd_RR(argLlo, dstLo)); /* XXXROUNDINGFIXME */ /* set roundingmode here */ addInstr(env, AMD64Instr_Sse64Fx2(op, argRhi, dstHi)); addInstr(env, AMD64Instr_Sse64Fx2(op, argRlo, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } case Iop_Add32Fx8: op = Asse_ADDF; goto do_32Fx8_w_rm; case Iop_Sub32Fx8: op = Asse_SUBF; goto do_32Fx8_w_rm; case Iop_Mul32Fx8: op = Asse_MULF; goto do_32Fx8_w_rm; case Iop_Div32Fx8: op = Asse_DIVF; goto do_32Fx8_w_rm; do_32Fx8_w_rm: { HReg argLhi, argLlo, argRhi, argRlo; iselDVecExpr(&argLhi, &argLlo, env, triop->arg2); iselDVecExpr(&argRhi, &argRlo, env, triop->arg3); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, mk_vMOVsd_RR(argLhi, dstHi)); addInstr(env, mk_vMOVsd_RR(argLlo, dstLo)); /* XXXROUNDINGFIXME */ /* set roundingmode here */ addInstr(env, AMD64Instr_Sse32Fx4(op, argRhi, dstHi)); addInstr(env, AMD64Instr_Sse32Fx4(op, argRlo, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } default: break; } /* switch (triop->op) */ } /* if (e->tag == Iex_Triop) */ if (e->tag == Iex_Qop && e->Iex.Qop.details->op == Iop_64x4toV256) { const IRExpr* arg1 = e->Iex.Qop.details->arg1; const IRExpr* arg2 = e->Iex.Qop.details->arg2; const IRExpr* arg3 = e->Iex.Qop.details->arg3; const IRExpr* arg4 = e->Iex.Qop.details->arg4; // If the args are trivially the same (tmp or const), use the same // source register for all four, and only one movq since those are // (relatively) expensive. if (areAtomsAndEqual(arg1, arg2) && areAtomsAndEqual(arg1, arg3) && areAtomsAndEqual(arg1, arg4)) { HReg q3 = iselIntExpr_R(env, e->Iex.Qop.details->arg1); HReg tmp = newVRegV(env); HReg dst = newVRegV(env); addInstr(env, AMD64Instr_SseMOVQ(q3, dst, True/*toXMM*/)); addInstr(env, mk_vMOVsd_RR(dst, tmp)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dst)); addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst)); *rHi = dst; *rLo = dst; } else { /* arg1 is the most significant (Q3), arg4 the least (Q0) */ HReg q3 = iselIntExpr_R(env, arg1); HReg q2 = iselIntExpr_R(env, arg2); HReg q1 = iselIntExpr_R(env, arg3); HReg q0 = iselIntExpr_R(env, arg4); HReg tmp = newVRegV(env); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, AMD64Instr_SseMOVQ(q3, dstHi, True/*toXMM*/)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstHi)); addInstr(env, AMD64Instr_SseMOVQ(q2, tmp, True/*toXMM*/)); addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstHi)); addInstr(env, AMD64Instr_SseMOVQ(q1, dstLo, True/*toXMM*/)); addInstr(env, AMD64Instr_SseShiftN(Asse_SHL128, 64, dstLo)); addInstr(env, AMD64Instr_SseMOVQ(q0, tmp, True/*toXMM*/)); addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dstLo)); *rHi = dstHi; *rLo = dstLo; } return; } if (e->tag == Iex_ITE) { HReg r1Hi, r1Lo, r0Hi, r0Lo; iselDVecExpr(&r1Hi, &r1Lo, env, e->Iex.ITE.iftrue); iselDVecExpr(&r0Hi, &r0Lo, env, e->Iex.ITE.iffalse); HReg dstHi = newVRegV(env); HReg dstLo = newVRegV(env); addInstr(env, mk_vMOVsd_RR(r1Hi,dstHi)); addInstr(env, mk_vMOVsd_RR(r1Lo,dstLo)); AMD64CondCode cc = iselCondCode_C(env, e->Iex.ITE.cond); addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Hi, dstHi)); addInstr(env, AMD64Instr_SseCMov(cc ^ 1, r0Lo, dstLo)); *rHi = dstHi; *rLo = dstLo; return; } //avx_fail: vex_printf("iselDVecExpr (amd64, subarch = %s): can't reduce\n", LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps)); ppIRExpr(e); vpanic("iselDVecExpr_wrk"); } /*---------------------------------------------------------*/ /*--- ISEL: Statements ---*/ /*---------------------------------------------------------*/ static void iselStmt ( ISelEnv* env, IRStmt* stmt ) { if (vex_traceflags & VEX_TRACE_VCODE) { vex_printf("\n-- "); ppIRStmt(stmt); vex_printf("\n"); } switch (stmt->tag) { /* --------- LOADG (guarded load) --------- */ case Ist_LoadG: { IRLoadG* lg = stmt->Ist.LoadG.details; if (lg->end != Iend_LE) goto stmt_fail; UChar szB = 0; /* invalid */ switch (lg->cvt) { case ILGop_Ident32: szB = 4; break; case ILGop_Ident64: szB = 8; break; case ILGop_IdentV128: szB = 16; break; default: break; } if (szB == 0) goto stmt_fail; AMD64AMode* amAddr = iselIntExpr_AMode(env, lg->addr); HReg rAlt = szB == 16 ? iselVecExpr(env, lg->alt) : iselIntExpr_R(env, lg->alt); HReg rDst = lookupIRTemp(env, lg->dst); /* Get the alt value into the dst. We'll do a conditional load which overwrites it -- or not -- with loaded data. */ if (szB == 16) { addInstr(env, mk_vMOVsd_RR(rAlt, rDst)); } else { addInstr(env, mk_iMOVsd_RR(rAlt, rDst)); } AMD64CondCode cc = iselCondCode_C(env, lg->guard); if (szB == 16) { addInstr(env, AMD64Instr_SseCLoad(cc, amAddr, rDst)); } else { addInstr(env, AMD64Instr_CLoad(cc, szB, amAddr, rDst)); } return; } /* --------- STOREG (guarded store) --------- */ case Ist_StoreG: { IRStoreG* sg = stmt->Ist.StoreG.details; if (sg->end != Iend_LE) goto stmt_fail; UChar szB = 0; /* invalid */ switch (typeOfIRExpr(env->type_env, sg->data)) { case Ity_I32: szB = 4; break; case Ity_I64: szB = 8; break; case Ity_V128: szB = 16; break; default: break; } if (szB == 0) goto stmt_fail; AMD64AMode* amAddr = iselIntExpr_AMode(env, sg->addr); HReg rSrc = szB == 16 ? iselVecExpr(env, sg->data) : iselIntExpr_R(env, sg->data); AMD64CondCode cc = iselCondCode_C(env, sg->guard); if (szB == 16) { addInstr(env, AMD64Instr_SseCStore(cc, rSrc, amAddr)); } else { addInstr(env, AMD64Instr_CStore(cc, szB, rSrc, amAddr)); } return; } /* --------- STORE --------- */ case Ist_Store: { IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr); IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data); IREndness end = stmt->Ist.Store.end; if (tya != Ity_I64 || end != Iend_LE) goto stmt_fail; if (tyd == Ity_I64) { AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr); AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data); addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am)); return; } if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) { AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr); HReg r = iselIntExpr_R(env, stmt->Ist.Store.data); addInstr(env, AMD64Instr_Store( toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)), r,am)); return; } if (tyd == Ity_F64) { AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr); HReg r = iselDblExpr(env, stmt->Ist.Store.data); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am)); return; } if (tyd == Ity_F32) { AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr); HReg r = iselFltExpr(env, stmt->Ist.Store.data); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am)); return; } if (tyd == Ity_V128) { AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr); HReg r = iselVecExpr(env, stmt->Ist.Store.data); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am)); return; } if (tyd == Ity_V256) { HReg rA = iselIntExpr_R(env, stmt->Ist.Store.addr); AMD64AMode* am0 = AMD64AMode_IR(0, rA); AMD64AMode* am16 = AMD64AMode_IR(16, rA); HReg vHi, vLo; iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Store.data); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0)); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16)); return; } break; } /* --------- PUT --------- */ case Ist_Put: { IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data); if (ty == Ity_I64) { /* We're going to write to memory, so compute the RHS into an AMD64RI. */ AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data); addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP()) )); return; } if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) { HReg r = iselIntExpr_R(env, stmt->Ist.Put.data); addInstr(env, AMD64Instr_Store( toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)), r, AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP()))); return; } if (ty == Ity_F32) { HReg f32 = iselFltExpr(env, stmt->Ist.Put.data); AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP()); set_SSE_rounding_default(env); /* paranoia */ addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am )); return; } if (ty == Ity_F64) { HReg f64 = iselDblExpr(env, stmt->Ist.Put.data); AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset, hregAMD64_RBP() ); addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am )); return; } if (ty == Ity_V128) { HReg vec = iselVecExpr(env, stmt->Ist.Put.data); AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP()); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am)); return; } if (ty == Ity_V256) { HReg vHi, vLo; iselDVecExpr(&vHi, &vLo, env, stmt->Ist.Put.data); HReg rbp = hregAMD64_RBP(); AMD64AMode* am0 = AMD64AMode_IR(stmt->Ist.Put.offset + 0, rbp); AMD64AMode* am16 = AMD64AMode_IR(stmt->Ist.Put.offset + 16, rbp); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vLo, am0)); addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vHi, am16)); return; } break; } /* --------- Indexed PUT --------- */ case Ist_PutI: { IRPutI *puti = stmt->Ist.PutI.details; AMD64AMode* am = genGuestArrayOffset( env, puti->descr, puti->ix, puti->bias ); IRType ty = typeOfIRExpr(env->type_env, puti->data); if (ty == Ity_F64) { HReg val = iselDblExpr(env, puti->data); addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am )); return; } if (ty == Ity_I8) { HReg r = iselIntExpr_R(env, puti->data); addInstr(env, AMD64Instr_Store( 1, r, am )); return; } if (ty == Ity_I64) { AMD64RI* ri = iselIntExpr_RI(env, puti->data); addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am )); return; } break; } /* --------- TMP --------- */ case Ist_WrTmp: { IRTemp tmp = stmt->Ist.WrTmp.tmp; IRType ty = typeOfIRTemp(env->type_env, tmp); /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..), compute it into an AMode and then use LEA. This usually produces fewer instructions, often because (for memcheck created IR) we get t = address-expression, (t is later used twice) and so doing this naturally turns address-expression back into an AMD64 amode. */ if (ty == Ity_I64 && stmt->Ist.WrTmp.data->tag == Iex_Binop && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) { AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data); HReg dst = lookupIRTemp(env, tmp); if (am->tag == Aam_IR && am->Aam.IR.imm == 0) { /* Hmm, iselIntExpr_AMode wimped out and just computed the value into a register. Just emit a normal reg-reg move so reg-alloc can coalesce it away in the usual way. */ HReg src = am->Aam.IR.reg; addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst)); } else { addInstr(env, AMD64Instr_Lea64(am,dst)); } return; } if (ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) { AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data); HReg dst = lookupIRTemp(env, tmp); addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst)); return; } if (ty == Ity_I128) { HReg rHi, rLo, dstHi, dstLo; iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data); lookupIRTempPair( &dstHi, &dstLo, env, tmp); addInstr(env, mk_iMOVsd_RR(rHi,dstHi) ); addInstr(env, mk_iMOVsd_RR(rLo,dstLo) ); return; } if (ty == Ity_I1) { AMD64CondCode cond = iselCondCode_C(env, stmt->Ist.WrTmp.data); HReg dst = lookupIRTemp(env, tmp); addInstr(env, AMD64Instr_Set64(cond, dst)); return; } if (ty == Ity_F64) { HReg dst = lookupIRTemp(env, tmp); HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data); addInstr(env, mk_vMOVsd_RR(src, dst)); return; } if (ty == Ity_F32) { HReg dst = lookupIRTemp(env, tmp); HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data); addInstr(env, mk_vMOVsd_RR(src, dst)); return; } if (ty == Ity_V128) { HReg dst = lookupIRTemp(env, tmp); HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data); addInstr(env, mk_vMOVsd_RR(src, dst)); return; } if (ty == Ity_V256) { HReg rHi, rLo, dstHi, dstLo; iselDVecExpr(&rHi,&rLo, env, stmt->Ist.WrTmp.data); lookupIRTempPair( &dstHi, &dstLo, env, tmp); addInstr(env, mk_vMOVsd_RR(rHi,dstHi) ); addInstr(env, mk_vMOVsd_RR(rLo,dstLo) ); return; } break; } /* --------- Call to DIRTY helper --------- */ case Ist_Dirty: { IRDirty* d = stmt->Ist.Dirty.details; /* Figure out the return type, if any. */ IRType retty = Ity_INVALID; if (d->tmp != IRTemp_INVALID) retty = typeOfIRTemp(env->type_env, d->tmp); /* Throw out any return types we don't know about. */ Bool retty_ok = False; switch (retty) { case Ity_INVALID: /* function doesn't return anything */ case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: case Ity_V128: case Ity_V256: retty_ok = True; break; default: break; } if (!retty_ok) break; /* will go to stmt_fail: */ /* Marshal args, do the call, and set the return value to 0x555..555 if this is a conditional call that returns a value and the call is skipped. */ UInt addToSp = 0; RetLoc rloc = mk_RetLoc_INVALID(); doHelperCall( &addToSp, &rloc, env, d->guard, d->cee, retty, d->args ); vassert(is_sane_RetLoc(rloc)); /* Now figure out what to do with the returned value, if any. */ switch (retty) { case Ity_INVALID: { /* No return value. Nothing to do. */ vassert(d->tmp == IRTemp_INVALID); vassert(rloc.pri == RLPri_None); vassert(addToSp == 0); return; } case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8: { /* The returned value is in %rax. Park it in the register associated with tmp. */ vassert(rloc.pri == RLPri_Int); vassert(addToSp == 0); HReg dst = lookupIRTemp(env, d->tmp); addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) ); return; } case Ity_V128: { /* The returned value is on the stack, and rloc.spOff tells us where. Fish it off the stack and then move the stack pointer upwards to clear it, as directed by doHelperCall. */ vassert(rloc.pri == RLPri_V128SpRel); vassert(addToSp >= 16); HReg dst = lookupIRTemp(env, d->tmp); AMD64AMode* am = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP()); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am )); add_to_rsp(env, addToSp); return; } case Ity_V256: { /* See comments for Ity_V128. */ vassert(rloc.pri == RLPri_V256SpRel); vassert(addToSp >= 32); HReg dstLo, dstHi; lookupIRTempPair(&dstHi, &dstLo, env, d->tmp); AMD64AMode* amLo = AMD64AMode_IR(rloc.spOff, hregAMD64_RSP()); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstLo, amLo )); AMD64AMode* amHi = AMD64AMode_IR(rloc.spOff+16, hregAMD64_RSP()); addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dstHi, amHi )); add_to_rsp(env, addToSp); return; } default: /*NOTREACHED*/ vassert(0); } break; } /* --------- MEM FENCE --------- */ case Ist_MBE: switch (stmt->Ist.MBE.event) { case Imbe_Fence: addInstr(env, AMD64Instr_MFence()); return; default: break; } break; /* --------- ACAS --------- */ case Ist_CAS: if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) { /* "normal" singleton CAS */ UChar sz; IRCAS* cas = stmt->Ist.CAS.details; IRType ty = typeOfIRExpr(env->type_env, cas->dataLo); /* get: cas->expd into %rax, and cas->data into %rbx */ AMD64AMode* am = iselIntExpr_AMode(env, cas->addr); HReg rData = iselIntExpr_R(env, cas->dataLo); HReg rExpd = iselIntExpr_R(env, cas->expdLo); HReg rOld = lookupIRTemp(env, cas->oldLo); vassert(cas->expdHi == NULL); vassert(cas->dataHi == NULL); addInstr(env, mk_iMOVsd_RR(rExpd, rOld)); addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX())); addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX())); switch (ty) { case Ity_I64: sz = 8; break; case Ity_I32: sz = 4; break; case Ity_I16: sz = 2; break; case Ity_I8: sz = 1; break; default: goto unhandled_cas; } addInstr(env, AMD64Instr_ACAS(am, sz)); addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOld)); return; } else { /* double CAS */ UChar sz; IRCAS* cas = stmt->Ist.CAS.details; IRType ty = typeOfIRExpr(env->type_env, cas->dataLo); /* only 32-bit and 64-bit allowed in this case */ /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */ /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */ AMD64AMode* am = iselIntExpr_AMode(env, cas->addr); HReg rDataHi = iselIntExpr_R(env, cas->dataHi); HReg rDataLo = iselIntExpr_R(env, cas->dataLo); HReg rExpdHi = iselIntExpr_R(env, cas->expdHi); HReg rExpdLo = iselIntExpr_R(env, cas->expdLo); HReg rOldHi = lookupIRTemp(env, cas->oldHi); HReg rOldLo = lookupIRTemp(env, cas->oldLo); switch (ty) { case Ity_I64: if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16)) goto unhandled_cas; /* we'd have to generate cmpxchg16b, but the host doesn't support that */ sz = 8; break; case Ity_I32: sz = 4; break; default: goto unhandled_cas; } addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi)); addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo)); addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX())); addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX())); addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX())); addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX())); addInstr(env, AMD64Instr_DACAS(am, sz)); addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RDX(), rOldHi)); addInstr(env, AMD64Instr_CMov64(Acc_NZ, hregAMD64_RAX(), rOldLo)); return; } unhandled_cas: break; /* --------- INSTR MARK --------- */ /* Doesn't generate any executable code ... */ case Ist_IMark: return; /* --------- ABI HINT --------- */ /* These have no meaning (denotation in the IR) and so we ignore them ... if any actually made it this far. */ case Ist_AbiHint: return; /* --------- NO-OP --------- */ case Ist_NoOp: return; /* --------- EXIT --------- */ case Ist_Exit: { if (stmt->Ist.Exit.dst->tag != Ico_U64) vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value"); AMD64CondCode cc = iselCondCode_C(env, stmt->Ist.Exit.guard); AMD64AMode* amRIP = AMD64AMode_IR(stmt->Ist.Exit.offsIP, hregAMD64_RBP()); /* Case: boring transfer to known address */ if (stmt->Ist.Exit.jk == Ijk_Boring) { if (env->chainingAllowed) { /* .. almost always true .. */ /* Skip the event check at the dst if this is a forwards edge. */ Bool toFastEP = ((Addr64)stmt->Ist.Exit.dst->Ico.U64) > env->max_ga; if (0) vex_printf("%s", toFastEP ? "Y" : ","); addInstr(env, AMD64Instr_XDirect(stmt->Ist.Exit.dst->Ico.U64, amRIP, cc, toFastEP)); } else { /* .. very occasionally .. */ /* We can't use chaining, so ask for an assisted transfer, as that's the only alternative that is allowable. */ HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, Ijk_Boring)); } return; } /* Case: assisted transfer to arbitrary address */ switch (stmt->Ist.Exit.jk) { /* Keep this list in sync with that in iselNext below */ case Ijk_ClientReq: case Ijk_EmWarn: case Ijk_NoDecode: case Ijk_NoRedir: case Ijk_SigSEGV: case Ijk_SigTRAP: case Ijk_Sys_syscall: case Ijk_Sys_int210: case Ijk_InvalICache: case Ijk_Yield: { HReg r = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst)); addInstr(env, AMD64Instr_XAssisted(r, amRIP, cc, stmt->Ist.Exit.jk)); return; } default: break; } /* Do we ever expect to see any other kind? */ goto stmt_fail; } default: break; } stmt_fail: ppIRStmt(stmt); vpanic("iselStmt(amd64)"); } /*---------------------------------------------------------*/ /*--- ISEL: Basic block terminators (Nexts) ---*/ /*---------------------------------------------------------*/ static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk, Int offsIP ) { if (vex_traceflags & VEX_TRACE_VCODE) { vex_printf( "\n-- PUT(%d) = ", offsIP); ppIRExpr( next ); vex_printf( "; exit-"); ppIRJumpKind(jk); vex_printf( "\n"); } /* Case: boring transfer to known address */ if (next->tag == Iex_Const) { IRConst* cdst = next->Iex.Const.con; vassert(cdst->tag == Ico_U64); if (jk == Ijk_Boring || jk == Ijk_Call) { /* Boring transfer to known address */ AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP()); if (env->chainingAllowed) { /* .. almost always true .. */ /* Skip the event check at the dst if this is a forwards edge. */ Bool toFastEP = ((Addr64)cdst->Ico.U64) > env->max_ga; if (0) vex_printf("%s", toFastEP ? "X" : "."); addInstr(env, AMD64Instr_XDirect(cdst->Ico.U64, amRIP, Acc_ALWAYS, toFastEP)); } else { /* .. very occasionally .. */ /* We can't use chaining, so ask for an indirect transfer, as that's the cheapest alternative that is allowable. */ HReg r = iselIntExpr_R(env, next); addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, Ijk_Boring)); } return; } } /* Case: call/return (==boring) transfer to any address */ switch (jk) { case Ijk_Boring: case Ijk_Ret: case Ijk_Call: { HReg r = iselIntExpr_R(env, next); AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP()); if (env->chainingAllowed) { addInstr(env, AMD64Instr_XIndir(r, amRIP, Acc_ALWAYS)); } else { addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, Ijk_Boring)); } return; } default: break; } /* Case: assisted transfer to arbitrary address */ switch (jk) { /* Keep this list in sync with that for Ist_Exit above */ case Ijk_ClientReq: case Ijk_EmWarn: case Ijk_NoDecode: case Ijk_NoRedir: case Ijk_SigSEGV: case Ijk_SigTRAP: case Ijk_Sys_syscall: case Ijk_Sys_int210: case Ijk_InvalICache: case Ijk_Yield: { HReg r = iselIntExpr_R(env, next); AMD64AMode* amRIP = AMD64AMode_IR(offsIP, hregAMD64_RBP()); addInstr(env, AMD64Instr_XAssisted(r, amRIP, Acc_ALWAYS, jk)); return; } default: break; } vex_printf( "\n-- PUT(%d) = ", offsIP); ppIRExpr( next ); vex_printf( "; exit-"); ppIRJumpKind(jk); vex_printf( "\n"); vassert(0); // are we expecting any other kind? } /*---------------------------------------------------------*/ /*--- Insn selector top-level ---*/ /*---------------------------------------------------------*/ /* Translate an entire SB to amd64 code. */ HInstrArray* iselSB_AMD64 ( const IRSB* bb, VexArch arch_host, const VexArchInfo* archinfo_host, const VexAbiInfo* vbi/*UNUSED*/, Int offs_Host_EvC_Counter, Int offs_Host_EvC_FailAddr, Bool chainingAllowed, Bool addProfInc, Addr max_ga ) { Int i, j; HReg hreg, hregHI; ISelEnv* env; UInt hwcaps_host = archinfo_host->hwcaps; AMD64AMode *amCounter, *amFailAddr; /* sanity ... */ vassert(arch_host == VexArchAMD64); vassert(0 == (hwcaps_host & ~(VEX_HWCAPS_AMD64_SSE3 | VEX_HWCAPS_AMD64_SSSE3 | VEX_HWCAPS_AMD64_CX16 | VEX_HWCAPS_AMD64_LZCNT | VEX_HWCAPS_AMD64_AVX | VEX_HWCAPS_AMD64_RDTSCP | VEX_HWCAPS_AMD64_BMI | VEX_HWCAPS_AMD64_AVX2 | VEX_HWCAPS_AMD64_F16C | VEX_HWCAPS_AMD64_RDRAND | VEX_HWCAPS_AMD64_RDSEED))); /* Check that the host's endianness is as expected. */ vassert(archinfo_host->endness == VexEndnessLE); /* Make up an initial environment to use. */ env = LibVEX_Alloc_inline(sizeof(ISelEnv)); env->vreg_ctr = 0; /* Set up output code array. */ env->code = newHInstrArray(); /* Copy BB's type env. */ env->type_env = bb->tyenv; /* Make up an IRTemp -> virtual HReg mapping. This doesn't change as we go along. */ env->n_vregmap = bb->tyenv->types_used; env->vregmap = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg)); env->vregmapHI = LibVEX_Alloc_inline(env->n_vregmap * sizeof(HReg)); /* and finally ... */ env->chainingAllowed = chainingAllowed; env->hwcaps = hwcaps_host; env->max_ga = max_ga; /* For each IR temporary, allocate a suitably-kinded virtual register. */ j = 0; for (i = 0; i < env->n_vregmap; i++) { hregHI = hreg = INVALID_HREG; switch (bb->tyenv->types[i]) { case Ity_I1: case Ity_I8: case Ity_I16: case Ity_I32: case Ity_I64: hreg = mkHReg(True, HRcInt64, 0, j++); break; case Ity_I128: hreg = mkHReg(True, HRcInt64, 0, j++); hregHI = mkHReg(True, HRcInt64, 0, j++); break; case Ity_F32: case Ity_F64: case Ity_V128: hreg = mkHReg(True, HRcVec128, 0, j++); break; case Ity_V256: hreg = mkHReg(True, HRcVec128, 0, j++); hregHI = mkHReg(True, HRcVec128, 0, j++); break; default: ppIRType(bb->tyenv->types[i]); vpanic("iselBB(amd64): IRTemp type"); } env->vregmap[i] = hreg; env->vregmapHI[i] = hregHI; } env->vreg_ctr = j; /* The very first instruction must be an event check. */ amCounter = AMD64AMode_IR(offs_Host_EvC_Counter, hregAMD64_RBP()); amFailAddr = AMD64AMode_IR(offs_Host_EvC_FailAddr, hregAMD64_RBP()); addInstr(env, AMD64Instr_EvCheck(amCounter, amFailAddr)); /* Possibly a block counter increment (for profiling). At this point we don't know the address of the counter, so just pretend it is zero. It will have to be patched later, but before this translation is used, by a call to LibVEX_patchProfCtr. */ if (addProfInc) { addInstr(env, AMD64Instr_ProfInc()); } /* Ok, finally we can iterate over the statements. */ for (i = 0; i < bb->stmts_used; i++) if (bb->stmts[i]) iselStmt(env, bb->stmts[i]); iselNext(env, bb->next, bb->jumpkind, bb->offsIP); /* record the number of vregs we used. */ env->code->n_vregs = env->vreg_ctr; return env->code; } /*---------------------------------------------------------------*/ /*--- end host_amd64_isel.c ---*/ /*---------------------------------------------------------------*/