/*
 * Copyright (C) 2016 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "AirEmitShuffle.h"

#if ENABLE(B3_JIT)

#include "AirInstInlines.h"
#include "AirRegisterPriority.h"
#include <wtf/GraphNodeWorklist.h>
#include <wtf/ListDump.h>

namespace JSC { namespace B3 { namespace Air {

namespace {

// Flip to true to get dataLog() tracing of the shuffle decomposition.
bool verbose = false;

// Walks the registers of the given type in priority order and returns the first one (wrapped in
// a Tmp) for which functor returns true. Returns an empty Tmp if no register is accepted.
template<typename Functor>
Tmp findPossibleScratch(Arg::Type type, const Functor& functor)
{
    for (Reg reg : regsInPriorityOrder(type)) {
        Tmp tmp(reg);
        if (functor(tmp))
            return tmp;
    }
    return Tmp();
}

// Convenience overload: finds a register of the given type that is used by neither arg1 nor
// arg2. Returns an empty Tmp if every register is used by one of them.
Tmp findPossibleScratch(Arg::Type type, const Arg& arg1, const Arg& arg2)
{
    return findPossibleScratch(
        type,
        [&] (Tmp tmp) -> bool {
            return !arg1.usesTmp(tmp) && !arg2.usesTmp(tmp);
        });
}

// Example: (a => b, b => a, a => c, b => d)
struct Rotate {
    Vector<ShufflePair> loop; // in the example, this is the loop: (a => b, b => a)
    Vector<ShufflePair> fringe; // in the example, these are the associated shifts: (a => c, b => d)
};

} // anonymous namespace

// Prints the pair as "width:src=>dst".
void ShufflePair::dump(PrintStream& out) const
{
    out.print(width(), ":", src(), "=>", dst());
}

// Builds a single Shuffle instruction whose arguments are, for each pair in order, the source,
// the destination and a width argument.
Inst createShuffle(Value* origin, const Vector<ShufflePair>& pairs)
{
    Inst result(Shuffle, origin);
    for (const ShufflePair& pair : pairs)
        result.append(pair.src(), pair.dst(), Arg::widthArg(pair.width()));
    return result;
}

// Turns a set of shuffle pairs, all of the given type, into a sequence of move and swap
// instructions.
//
// pairs: the src => dst moves to perform. Pairs whose src equals their dst are dropped.
// scratches: two scratch locations supplied by the caller. An entry that is a tmp is used
//     directly as a scratch register. An entry that is not a tmp is used as a save slot: a
//     register that the current operation does not touch is moved into it, used as the scratch,
//     and moved back afterwards.
// type: the register bank (GP or FP) that selects the move opcodes and the scratch candidates.
// origin: the origin attached to every emitted instruction.
//
// Returns the instructions in the order in which they should execute.
Vector<Inst> emitShuffle(
    Vector<ShufflePair> pairs, std::array<Arg, 2> scratches, Arg::Type type, Value* origin)
{
    if (verbose) {
        dataLog(
            "Dealing with pairs: ", listDump(pairs), " and scratches ", scratches[0], ", ",
            scratches[1], "\n");
    }

    pairs.removeAllMatching(
        [&] (const ShufflePair& pair) -> bool {
            return pair.src() == pair.dst();
        });

    // First validate that this is the kind of shuffle that we know how to deal with.
#if !ASSERT_DISABLED
    for (const ShufflePair& pair : pairs) {
        ASSERT(pair.src().isType(type));
        ASSERT(pair.dst().isType(type));
        ASSERT(pair.dst().isTmp() || pair.dst().isMemory());
    }
#endif // !ASSERT_DISABLED

    // There are two possible kinds of operations that we will do:
    //
    // - Shift. Example: (a => b, b => c). We emit this as "Move b, c; Move a, b". This only requires
    //   scratch registers if there are memory->memory moves. We want to find as many of these as
    //   possible because they are cheaper. Note that shifts can involve the same source mentioned
    //   multiple times. Example: (a => b, a => c, b => d, b => e).
    //
    // - Rotate. Example: (a => b, b => a). We want to emit this as "Swap a, b", but that instruction
    //   may not be available, in which case we may need a scratch register or a scratch memory
    //   location. A gnarlier example is (a => b, b => c, c => a). We can emit this as "Swap b, c;
    //   Swap a, b". Note that swapping has to be careful about differing widths.
    //
    // Note that a rotate can have "fringe". For example, we might have (a => b, b => a, a => c,
    // b => d). This has a rotate loop (a => b, b => a) and some fringe (a => c, b => d). We treat
    // the whole thing as a single rotate.
    //
    // We will find multiple disjoint such operations. We can execute them in any order.

    // We interpret these as Moves that should be executed backwards. All shifts are keyed by their
    // starting source.
    HashMap<Arg, Vector<ShufflePair>> shifts;

    // We interpret these as Swaps over src()'s that should be executed backwards, i.e. for a list
    // of size 3 we would do "Swap list[1].src(), list[2].src(); Swap list[0].src(), list[1].src()".
    // Note that we actually can't do that if the widths don't match or other bad things happen.
    // But, prior to executing all of that, we need to execute the fringe: the shifts coming off the
    // rotate.
    Vector<Rotate> rotates;

    {
        // Maps each source to all of the pairs that read from it.
        HashMap<Arg, Vector<ShufflePair>> mapping;
        for (const ShufflePair& pair : pairs)
            mapping.add(pair.src(), Vector<ShufflePair>()).iterator->value.append(pair);

        Vector<ShufflePair> currentPairs;

        while (!mapping.isEmpty()) {
            ASSERT(currentPairs.isEmpty());
            Arg originalSrc = mapping.begin()->key;
            ASSERT(!shifts.contains(originalSrc));
            if (verbose)
                dataLog("Processing from ", originalSrc, "\n");

            GraphNodeWorklist<Arg> worklist;
            worklist.push(originalSrc);
            while (Arg src = worklist.pop()) {
                auto iter = mapping.find(src);
                if (iter == mapping.end()) {
                    // With a shift it's possible that we previously built the tail of this shift.
                    // See if that's the case now.
                    if (verbose)
                        dataLog("Trying to append shift at ", src, "\n");
                    currentPairs.appendVector(shifts.take(src));
                    continue;
                }
                Vector<ShufflePair> pairsFromSrc = WTFMove(iter->value);
                mapping.remove(iter);

                for (const ShufflePair& pair : pairsFromSrc) {
                    currentPairs.append(pair);
                    ASSERT(pair.src() == src);
                    worklist.push(pair.dst());
                }
            }

            ASSERT(currentPairs.size());
            ASSERT(currentPairs[0].src() == originalSrc);

            if (verbose)
                dataLog("currentPairs = ", listDump(currentPairs), "\n");

            // It's a rotate if some pair writes back into the source we started from.
            bool isRotate = false;
            for (const ShufflePair& pair : currentPairs) {
                if (pair.dst() == originalSrc) {
                    isRotate = true;
                    break;
                }
            }

            if (isRotate) {
                if (verbose)
                    dataLog("It's a rotate.\n");
                Rotate rotate;

                // The common case is that the rotate does not have fringe. The only way to
                // check for this is to examine the whole rotate.
                bool ok;
                if (currentPairs.last().dst() == originalSrc) {
                    ok = true;
                    for (unsigned i = currentPairs.size() - 1; i--;)
                        ok &= currentPairs[i].dst() == currentPairs[i + 1].src();
                } else
                    ok = false;

                if (ok)
                    rotate.loop = WTFMove(currentPairs);
                else {
                    // This is the slow path. The rotate has fringe.

                    HashMap<Arg, ShufflePair> dstMapping;
                    for (const ShufflePair& pair : currentPairs)
                        dstMapping.add(pair.dst(), pair);

                    // Walk the loop backwards, from the pair that writes originalSrc, following
                    // each pair's source to the pair that writes it.
                    ShufflePair pair = dstMapping.take(originalSrc);
                    for (;;) {
                        rotate.loop.append(pair);

                        auto iter = dstMapping.find(pair.src());
                        if (iter == dstMapping.end())
                            break;
                        pair = iter->value;
                        dstMapping.remove(iter);
                    }

                    rotate.loop.reverse();

                    // Make sure that the fringe appears in the same order as how it appeared in the
                    // currentPairs, since that's the DFS order.
                    for (const ShufflePair& pair : currentPairs) {
                        // But of course we only include it if it's not in the loop.
                        if (dstMapping.contains(pair.dst()))
                            rotate.fringe.append(pair);
                    }
                }

                // If the graph search terminates because we returned to the first source, then the
                // pair list has to have a very particular shape.
                for (unsigned i = rotate.loop.size() - 1; i--;)
                    ASSERT(rotate.loop[i].dst() == rotate.loop[i + 1].src());
                rotates.append(WTFMove(rotate));
                currentPairs.resize(0);
            } else {
                if (verbose)
                    dataLog("It's a shift.\n");
                shifts.add(originalSrc, WTFMove(currentPairs));
            }
        }
    }

    if (verbose) {
        dataLog("Shifts:\n");
        for (auto& entry : shifts)
            dataLog("    ", entry.key, ": ", listDump(entry.value), "\n");
        dataLog("Rotates:\n");
        for (auto& rotate : rotates)
            dataLog("    loop = ", listDump(rotate.loop), ", fringe = ", listDump(rotate.fringe), "\n");
    }

    // In the worst case, we need two scratch registers. The way we do this is that the client passes
    // us what scratch registers it happens to have lying around. We will need scratch registers in
    // the following cases:
    //
    // - Shuffle pairs where both src and dst refer to memory.
    // - Rotate when no Swap instruction is available.
    //
    // Lucky for us, we are guaranteed to have extra scratch registers anytime we have a Shift that
    // ends with a register. We search for such a register right now.

    auto moveForWidth = [&] (Arg::Width width) -> Opcode {
        switch (width) {
        case Arg::Width32:
            return type == Arg::GP ? Move32 : MoveFloat;
        case Arg::Width64:
            return type == Arg::GP ? Move : MoveDouble;
        default:
            RELEASE_ASSERT_NOT_REACHED();
        }
    };

    Opcode conservativeMove = moveForWidth(Arg::conservativeWidth(type));

    // We will emit things in reverse. We maintain a list of packs of instructions, and then we
    // append them together in reverse (for example the thing at the end of resultPacks is placed
    // first). This is useful because the last thing we emit frees up its destination registers, so
    // it affects how we emit things before it.
    Vector<Vector<Inst>> resultPacks;
    Vector<Inst> result;

    // Closes the current pack of instructions and starts a new, empty one.
    auto commitResult = [&] () {
        resultPacks.append(WTFMove(result));
    };

    // Returns a scratch register for slot `index`. If the slot already holds a tmp, that tmp is
    // the scratch. Otherwise possibleScratch is saved into the slot and then used as the scratch;
    // if there is no possibleScratch, an empty Tmp is returned.
    auto getScratch = [&] (unsigned index, Tmp possibleScratch) -> Tmp {
        if (scratches[index].isTmp())
            return scratches[index].tmp();

        if (!possibleScratch)
            return Tmp();
        result.append(Inst(conservativeMove, origin, possibleScratch, scratches[index]));
        return possibleScratch;
    };

    // Undoes getScratch(): if tmp was borrowed (i.e. it is not the slot itself), restores its old
    // value from the slot.
    auto returnScratch = [&] (unsigned index, Tmp tmp) {
        if (Arg(tmp) != scratches[index])
            result.append(Inst(conservativeMove, origin, scratches[index], tmp));
    };

    // Emits one src => dst move, going through a scratch register obtained from slot scratchIndex
    // when the move opcode has no direct form for this src/dst kind combination.
    auto handleShiftPair = [&] (const ShufflePair& pair, unsigned scratchIndex) {
        Opcode move = moveForWidth(pair.width());

        if (!isValidForm(move, pair.src().kind(), pair.dst().kind())) {
            Tmp scratch =
                getScratch(scratchIndex, findPossibleScratch(type, pair.src(), pair.dst()));
            RELEASE_ASSERT(scratch);
            if (isValidForm(move, pair.src().kind(), Arg::Tmp))
                result.append(Inst(move, origin, pair.src(), scratch));
            else {
                ASSERT(pair.src().isSomeImm());
                ASSERT(move == Move32);
                result.append(Inst(Move, origin, Arg::bigImm(pair.src().value()), scratch));
            }
            result.append(Inst(move, origin, scratch, pair.dst()));
            returnScratch(scratchIndex, scratch);
            return;
        }

        result.append(Inst(move, origin, pair.src(), pair.dst()));
    };

    // Emits a whole shift, last pair first, and then donates the shift's final destination as a
    // scratch register if it is a tmp and some scratch slot is not a tmp yet.
    auto handleShift = [&] (Vector<ShufflePair>& shift) {
        // FIXME: We could optimize the spill behavior of the shifter by checking if any of the
        // shifts need spills. If they do, then we could try to get a register out here. Note that
        // this may fail where the current strategy succeeds: out here we need a register that does
        // not interfere with any of the shifts, while the current strategy only needs to find a
        // scratch register that does not interfere with a particular shift. So, this optimization
        // will be opportunistic: if it succeeds, then the individual shifts can use that scratch,
        // otherwise they will do what they do now.

        for (unsigned i = shift.size(); i--;)
            handleShiftPair(shift[i], 0);

        Arg lastDst = shift.last().dst();
        if (lastDst.isTmp()) {
            for (Arg& scratch : scratches) {
                ASSERT(scratch != lastDst);
                if (!scratch.isTmp()) {
                    scratch = lastDst;
                    break;
                }
            }
        }
    };

    // First handle shifts whose last destination is a tmp because these free up scratch registers.
    // These end up last in the final sequence, so the final destination of these shifts will be
    // available as a scratch location for anything emitted prior (so, after, since we're emitting in
    // reverse).
    for (auto& entry : shifts) {
        Vector<ShufflePair>& shift = entry.value;
        if (shift.last().dst().isTmp())
            handleShift(shift);
        commitResult();
    }

    // Now handle the rest of the shifts.
    for (auto& entry : shifts) {
        Vector<ShufflePair>& shift = entry.value;
        if (!shift.last().dst().isTmp())
            handleShift(shift);
        commitResult();
    }

    for (Rotate& rotate : rotates) {
        if (!rotate.fringe.isEmpty()) {
            // Make sure we do the fringe first! This won't clobber any of the registers that are
            // part of the rotation.
            handleShift(rotate.fringe);
        }

        bool canSwap = false;
        Opcode swap = Oops;
        Arg::Width swapWidth = Arg::Width8; // bogus value

        // Currently, the swap instruction is not available for floating point on any architecture we
        // support.
        if (type == Arg::GP) {
            // Figure out whether we will be doing 64-bit swaps or 32-bit swaps. If we have a mix of
            // widths we handle that by fixing up the relevant register with zero-extends.
            swap = Swap32;
            swapWidth = Arg::Width32;
            bool hasMemory = false;
            bool hasIndex = false;
            for (const ShufflePair& pair : rotate.loop) {
                switch (pair.width()) {
                case Arg::Width32:
                    break;
                case Arg::Width64:
                    swap = Swap64;
                    swapWidth = Arg::Width64;
                    break;
                default:
                    RELEASE_ASSERT_NOT_REACHED();
                    break;
                }

                hasMemory |= pair.src().isMemory() || pair.dst().isMemory();
                hasIndex |= pair.src().isIndex() || pair.dst().isIndex();
            }

            canSwap = isValidForm(swap, Arg::Tmp, Arg::Tmp);

            // We can totally use swaps even if there are shuffles involving memory. But, we play it
            // safe in that case. There are corner cases we don't handle, and our ability to do it is
            // contingent upon swap form availability.

            if (hasMemory) {
                canSwap &= isValidForm(swap, Arg::Tmp, Arg::Addr);

                // We don't take the swapping path if there is a mix of widths and some of the
                // shuffles involve memory. That gets too confusing. We might be able to relax this
                // to only bail if there are subwidth pairs involving memory, but I haven't thought
                // about it very hard. Anyway, this case is not common: rotates involving memory
                // don't arise for function calls, and they will only happen for rotates in user code
                // if some of the variables get spilled. It's hard to imagine a program that rotates
                // data around in variables while also doing a combination of uint32->uint64 and
                // int64->int32 casts.
                for (const ShufflePair& pair : rotate.loop)
                    canSwap &= pair.width() == swapWidth;
            }

            if (hasIndex)
                canSwap &= isValidForm(swap, Arg::Tmp, Arg::Index);
        }

        if (canSwap) {
            for (unsigned i = rotate.loop.size() - 1; i--;) {
                Arg left = rotate.loop[i].src();
                Arg right = rotate.loop[i + 1].src();

                if (left.isMemory() && right.isMemory()) {
                    // Note that this is a super rare outcome. Rotates are rare. Spills are rare.
                    // Moving data between two spills is rare. To get here a lot of rare stuff has to
                    // all happen at once.

                    Tmp scratch = getScratch(0, findPossibleScratch(type, left, right));
                    RELEASE_ASSERT(scratch);
                    result.append(Inst(moveForWidth(swapWidth), origin, left, scratch));
                    result.append(Inst(swap, origin, scratch, right));
                    result.append(Inst(moveForWidth(swapWidth), origin, scratch, left));
                    returnScratch(0, scratch);
                    continue;
                }

                // Put the memory operand, if any, on the right-hand side of the swap.
                if (left.isMemory())
                    std::swap(left, right);

                result.append(Inst(swap, origin, left, right));
            }

            for (const ShufflePair& pair : rotate.loop) {
                if (pair.width() == swapWidth)
                    continue;

                RELEASE_ASSERT(pair.width() == Arg::Width32);
                RELEASE_ASSERT(swapWidth == Arg::Width64);
                RELEASE_ASSERT(pair.dst().isTmp());

                // Need to do an extra zero extension.
                result.append(Inst(Move32, origin, pair.dst(), pair.dst()));
            }
        } else {
            // We can treat this as a shift so long as we take the last destination (i.e. first
            // source) and save it first. Then we handle the first entry in the pair in the rotate
            // specially, after we restore the last destination. This requires some special care to
            // find a scratch register. It's possible that we have a rotate that uses the entire
            // available register file.

            Tmp scratch = findPossibleScratch(
                type,
                [&] (Tmp tmp) -> bool {
                    for (const ShufflePair& pair : rotate.loop) {
                        if (pair.src().usesTmp(tmp))
                            return false;
                        if (pair.dst().usesTmp(tmp))
                            return false;
                    }
                    return true;
                });

            // NOTE: This is the most likely use of scratch registers.
            scratch = getScratch(0, scratch);

            // We may not have found a scratch register. When this happens, we can just use the spill
            // slot directly.
            Arg rotateSave = scratch ? Arg(scratch) : scratches[0];

            handleShiftPair(
                ShufflePair(rotate.loop.last().dst(), rotateSave, rotate.loop[0].width()), 1);

            for (unsigned i = rotate.loop.size(); i-- > 1;)
                handleShiftPair(rotate.loop[i], 1);

            handleShiftPair(
                ShufflePair(rotateSave, rotate.loop[0].dst(), rotate.loop[0].width()), 1);

            if (scratch)
                returnScratch(0, scratch);
        }

        commitResult();
    }

    ASSERT(result.isEmpty());

    // Packs were built last-to-execute first, so stitch them together in reverse.
    for (unsigned i = resultPacks.size(); i--;)
        result.appendVector(resultPacks[i]);

    return result;
}

// Splits pairs into a GP group and an FP group, emits each group with its own pair of scratch
// locations via the typed emitShuffle() overload, and returns the GP instructions followed by the
// FP instructions.
Vector<Inst> emitShuffle(
    const Vector<ShufflePair>& pairs,
    const std::array<Arg, 2>& gpScratch, const std::array<Arg, 2>& fpScratch,
    Value* origin)
{
    Vector<ShufflePair> gpPairs;
    Vector<ShufflePair> fpPairs;
    for (const ShufflePair& pair : pairs) {
        if (pair.src().isMemory() && pair.dst().isMemory() && pair.width() > Arg::pointerWidth()) {
            // 8-byte memory-to-memory moves on a 32-bit platform are best handled as float moves.
            fpPairs.append(pair);
        } else if (pair.src().isGP() && pair.dst().isGP()) {
            // This means that gpPairs gets memory-to-memory shuffles. The assumption is that we
            // can do that more efficiently using GPRs, except in the special case above.
            gpPairs.append(pair);
        } else
            fpPairs.append(pair);
    }

    Vector<Inst> result;
    result.appendVector(emitShuffle(gpPairs, gpScratch, Arg::GP, origin));
    result.appendVector(emitShuffle(fpPairs, fpScratch, Arg::FP, origin));
    return result;
}

} } } // namespace JSC::B3::Air

#endif // ENABLE(B3_JIT)