/*
 * Copyright (C) 2016 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
#include "AirEmitShuffle.h"

#if ENABLE(B3_JIT)

#include "AirInstInlines.h"
#include "AirRegisterPriority.h"
#include <wtf/GraphNodeWorklist.h>
#include <wtf/ListDump.h>

namespace JSC { namespace B3 { namespace Air {

namespace {

bool verbose = false;

template<typename Functor>
Tmp findPossibleScratch(Arg::Type type, const Functor& functor) {
    for (Reg reg : regsInPriorityOrder(type)) {
        Tmp tmp(reg);
        if (functor(tmp))
            return tmp;
    }
    return Tmp();
}

Tmp findPossibleScratch(Arg::Type type, const Arg& arg1, const Arg& arg2) {
    return findPossibleScratch(
        type,
        [&] (Tmp tmp) -> bool {
            return !arg1.usesTmp(tmp) && !arg2.usesTmp(tmp);
        });
}

// Example: (a => b, b => a, a => c, b => d)
struct Rotate {
    Vector<ShufflePair> loop; // in the example, this is the loop: (a => b, b => a)
    Vector<ShufflePair> fringe; // in the example, these are the associated shifts: (a => c, b => d)
};

} // anonymous namespace

void ShufflePair::dump(PrintStream& out) const
{
    out.print(width(), ":", src(), "=>", dst());
}

Inst createShuffle(Value* origin, const Vector<ShufflePair>& pairs)
{
    Inst result(Shuffle, origin);
    for (const ShufflePair& pair : pairs)
        result.append(pair.src(), pair.dst(), Arg::widthArg(pair.width()));
    return result;
}

Vector<Inst> emitShuffle(
    Vector<ShufflePair> pairs, std::array<Arg, 2> scratches, Arg::Type type, Value* origin)
{
    if (verbose) {
        dataLog(
            "Dealing with pairs: ", listDump(pairs), " and scratches ", scratches[0], ", ",
            scratches[1], "\n");
    }
    
    pairs.removeAllMatching(
        [&] (const ShufflePair& pair) -> bool {
            return pair.src() == pair.dst();
        });
    
    // First validate that this is the kind of shuffle that we know how to deal with.
#if !ASSERT_DISABLED
    for (const ShufflePair& pair : pairs) {
        ASSERT(pair.src().isType(type));
        ASSERT(pair.dst().isType(type));
        ASSERT(pair.dst().isTmp() || pair.dst().isMemory());
    }
#endif // !ASSERT_DISABLED

    // There are two possible kinds of operations that we will do:
    //
    // - Shift. Example: (a => b, b => c). We emit this as "Move b, c; Move a, b". This only requires
    //   scratch registers if there are memory->memory moves. We want to find as many of these as
    //   possible because they are cheaper. Note that shifts can involve the same source mentioned
    //   multiple times. Example: (a => b, a => c, b => d, b => e).
    //
    // - Rotate. Example: (a => b, b => a). We want to emit this as "Swap a, b", but that instruction
    //   may not be available, in which case we may need a scratch register or a scratch memory
    //   location. A gnarlier example is (a => b, b => c, c => a). We can emit this as "Swap b, c;
    //   Swap a, b". Note that swapping has to be careful about differing widths.
    //
    // Note that a rotate can have "fringe". For example, we might have (a => b, b => a, a =>c,
    // b => d). This has a rotate loop (a => b, b => a) and some fringe (a => c, b => d). We treat
    // the whole thing as a single rotate.
    //
    // We will find multiple disjoint such operations. We can execute them in any order.

    // We interpret these as Moves that should be executed backwards. All shifts are keyed by their
    // starting source.
    HashMap<Arg, Vector<ShufflePair>> shifts;

    // We interpret these as Swaps over src()'s that should be executed backwards, i.e. for a list
    // of size 3 we would do "Swap list[1].src(), list[2].src(); Swap list[0].src(), list[1].src()".
    // Note that we actually can't do that if the widths don't match or other bad things happen.
    // But, prior to executing all of that, we need to execute the fringe: the shifts comming off the
    // rotate.
    Vector<Rotate> rotates;

    {
        HashMap<Arg, Vector<ShufflePair>> mapping;
        for (const ShufflePair& pair : pairs)
            mapping.add(pair.src(), Vector<ShufflePair>()).iterator->value.append(pair);

        Vector<ShufflePair> currentPairs;

        while (!mapping.isEmpty()) {
            ASSERT(currentPairs.isEmpty());
            Arg originalSrc = mapping.begin()->key;
            ASSERT(!shifts.contains(originalSrc));
            if (verbose)
                dataLog("Processing from ", originalSrc, "\n");
            
            GraphNodeWorklist<Arg> worklist;
            worklist.push(originalSrc);
            while (Arg src = worklist.pop()) {
                HashMap<Arg, Vector<ShufflePair>>::iterator iter = mapping.find(src);
                if (iter == mapping.end()) {
                    // With a shift it's possible that we previously built the tail of this shift.
                    // See if that's the case now.
                    if (verbose)
                        dataLog("Trying to append shift at ", src, "\n");
                    currentPairs.appendVector(shifts.take(src));
                    continue;
                }
                Vector<ShufflePair> pairs = WTFMove(iter->value);
                mapping.remove(iter);

                for (const ShufflePair& pair : pairs) {
                    currentPairs.append(pair);
                    ASSERT(pair.src() == src);
                    worklist.push(pair.dst());
                }
            }

            ASSERT(currentPairs.size());
            ASSERT(currentPairs[0].src() == originalSrc);

            if (verbose)
                dataLog("currentPairs = ", listDump(currentPairs), "\n");

            bool isRotate = false;
            for (const ShufflePair& pair : currentPairs) {
                if (pair.dst() == originalSrc) {
                    isRotate = true;
                    break;
                }
            }

            if (isRotate) {
                if (verbose)
                    dataLog("It's a rotate.\n");
                Rotate rotate;

                // The common case is that the rotate does not have fringe. The only way to
                // check for this is to examine the whole rotate.
                bool ok;
                if (currentPairs.last().dst() == originalSrc) {
                    ok = true;
                    for (unsigned i = currentPairs.size() - 1; i--;)
                        ok &= currentPairs[i].dst() == currentPairs[i + 1].src();
                } else
                    ok = false;
                
                if (ok)
                    rotate.loop = WTFMove(currentPairs);
                else {
                    // This is the slow path. The rotate has fringe.
                    
                    HashMap<Arg, ShufflePair> dstMapping;
                    for (const ShufflePair& pair : currentPairs)
                        dstMapping.add(pair.dst(), pair);

                    ShufflePair pair = dstMapping.take(originalSrc);
                    for (;;) {
                        rotate.loop.append(pair);

                        auto iter = dstMapping.find(pair.src());
                        if (iter == dstMapping.end())
                            break;
                        pair = iter->value;
                        dstMapping.remove(iter);
                    }

                    rotate.loop.reverse();

                    // Make sure that the fringe appears in the same order as how it appeared in the
                    // currentPairs, since that's the DFS order.
                    for (const ShufflePair& pair : currentPairs) {
                        // But of course we only include it if it's not in the loop.
                        if (dstMapping.contains(pair.dst()))
                            rotate.fringe.append(pair);
                    }
                }
                
                // If the graph search terminates because we returned to the first source, then the
                // pair list has to have a very particular shape.
                for (unsigned i = rotate.loop.size() - 1; i--;)
                    ASSERT(rotate.loop[i].dst() == rotate.loop[i + 1].src());
                rotates.append(WTFMove(rotate));
                currentPairs.resize(0);
            } else {
                if (verbose)
                    dataLog("It's a shift.\n");
                shifts.add(originalSrc, WTFMove(currentPairs));
            }
        }
    }

    if (verbose) {
        dataLog("Shifts:\n");
        for (auto& entry : shifts)
            dataLog("    ", entry.key, ": ", listDump(entry.value), "\n");
        dataLog("Rotates:\n");
        for (auto& rotate : rotates)
            dataLog("    loop = ", listDump(rotate.loop), ", fringe = ", listDump(rotate.fringe), "\n");
    }

    // In the worst case, we need two scratch registers. The way we do this is that the client passes
    // us what scratch registers he happens to have laying around. We will need scratch registers in
    // the following cases:
    //
    // - Shuffle pairs where both src and dst refer to memory.
    // - Rotate when no Swap instruction is available.
    //
    // Lucky for us, we are guaranteed to have extra scratch registers anytime we have a Shift that
    // ends with a register. We search for such a register right now.

    auto moveForWidth = [&] (Arg::Width width) -> Opcode {
        switch (width) {
        case Arg::Width32:
            return type == Arg::GP ? Move32 : MoveFloat;
        case Arg::Width64:
            return type == Arg::GP ? Move : MoveDouble;
        default:
            RELEASE_ASSERT_NOT_REACHED();
        }
    };

    Opcode conservativeMove = moveForWidth(Arg::conservativeWidth(type));

    // We will emit things in reverse. We maintain a list of packs of instructions, and then we emit
    // append them together in reverse (for example the thing at the end of resultPacks is placed
    // first). This is useful because the last thing we emit frees up its destination registers, so
    // it affects how we emit things before it.
    Vector<Vector<Inst>> resultPacks;
    Vector<Inst> result;

    auto commitResult = [&] () {
        resultPacks.append(WTFMove(result));
    };

    auto getScratch = [&] (unsigned index, Tmp possibleScratch) -> Tmp {
        if (scratches[index].isTmp())
            return scratches[index].tmp();

        if (!possibleScratch)
            return Tmp();
        result.append(Inst(conservativeMove, origin, possibleScratch, scratches[index]));
        return possibleScratch;
    };

    auto returnScratch = [&] (unsigned index, Tmp tmp) {
        if (Arg(tmp) != scratches[index])
            result.append(Inst(conservativeMove, origin, scratches[index], tmp));
    };

    auto handleShiftPair = [&] (const ShufflePair& pair, unsigned scratchIndex) {
        Opcode move = moveForWidth(pair.width());
        
        if (!isValidForm(move, pair.src().kind(), pair.dst().kind())) {
            Tmp scratch =
                getScratch(scratchIndex, findPossibleScratch(type, pair.src(), pair.dst()));
            RELEASE_ASSERT(scratch);
            if (isValidForm(move, pair.src().kind(), Arg::Tmp))
                result.append(Inst(moveForWidth(pair.width()), origin, pair.src(), scratch));
            else {
                ASSERT(pair.src().isSomeImm());
                ASSERT(move == Move32);
                result.append(Inst(Move, origin, Arg::bigImm(pair.src().value()), scratch));
            }
            result.append(Inst(moveForWidth(pair.width()), origin, scratch, pair.dst()));
            returnScratch(scratchIndex, scratch);
            return;
        }
        
        result.append(Inst(move, origin, pair.src(), pair.dst()));
    };

    auto handleShift = [&] (Vector<ShufflePair>& shift) {
        // FIXME: We could optimize the spill behavior of the shifter by checking if any of the
        // shifts need spills. If they do, then we could try to get a register out here. Note that
        // this may fail where the current strategy succeeds: out here we need a register that does
        // not interfere with any of the shifts, while the current strategy only needs to find a
        // scratch register that does not interfer with a particular shift. So, this optimization
        // will be opportunistic: if it succeeds, then the individual shifts can use that scratch,
        // otherwise they will do what they do now.
        
        for (unsigned i = shift.size(); i--;)
            handleShiftPair(shift[i], 0);

        Arg lastDst = shift.last().dst();
        if (lastDst.isTmp()) {
            for (Arg& scratch : scratches) {
                ASSERT(scratch != lastDst);
                if (!scratch.isTmp()) {
                    scratch = lastDst;
                    break;
                }
            }
        }
    };

    // First handle shifts whose last destination is a tmp because these free up scratch registers.
    // These end up last in the final sequence, so the final destination of these shifts will be
    // available as a scratch location for anything emitted prior (so, after, since we're emitting in
    // reverse).
    for (auto& entry : shifts) {
        Vector<ShufflePair>& shift = entry.value;
        if (shift.last().dst().isTmp())
            handleShift(shift);
        commitResult();
    }

    // Now handle the rest of the shifts.
    for (auto& entry : shifts) {
        Vector<ShufflePair>& shift = entry.value;
        if (!shift.last().dst().isTmp())
            handleShift(shift);
        commitResult();
    }

    for (Rotate& rotate : rotates) {
        if (!rotate.fringe.isEmpty()) {
            // Make sure we do the fringe first! This won't clobber any of the registers that are
            // part of the rotation.
            handleShift(rotate.fringe);
        }
        
        bool canSwap = false;
        Opcode swap = Oops;
        Arg::Width swapWidth = Arg::Width8; // bogus value

        // Currently, the swap instruction is not available for floating point on any architecture we
        // support.
        if (type == Arg::GP) {
            // Figure out whether we will be doing 64-bit swaps or 32-bit swaps. If we have a mix of
            // widths we handle that by fixing up the relevant register with zero-extends.
            swap = Swap32;
            swapWidth = Arg::Width32;
            bool hasMemory = false;
            bool hasIndex = false;
            for (ShufflePair& pair : rotate.loop) {
                switch (pair.width()) {
                case Arg::Width32:
                    break;
                case Arg::Width64:
                    swap = Swap64;
                    swapWidth = Arg::Width64;
                    break;
                default:
                    RELEASE_ASSERT_NOT_REACHED();
                    break;
                }

                hasMemory |= pair.src().isMemory() || pair.dst().isMemory();
                hasIndex |= pair.src().isIndex() || pair.dst().isIndex();
            }
            
            canSwap = isValidForm(swap, Arg::Tmp, Arg::Tmp);

            // We can totally use swaps even if there are shuffles involving memory. But, we play it
            // safe in that case. There are corner cases we don't handle, and our ability to do it is
            // contingent upon swap form availability.
            
            if (hasMemory) {
                canSwap &= isValidForm(swap, Arg::Tmp, Arg::Addr);
                
                // We don't take the swapping path if there is a mix of widths and some of the
                // shuffles involve memory. That gets too confusing. We might be able to relax this
                // to only bail if there are subwidth pairs involving memory, but I haven't thought
                // about it very hard. Anyway, this case is not common: rotates involving memory
                // don't arise for function calls, and they will only happen for rotates in user code
                // if some of the variables get spilled. It's hard to imagine a program that rotates
                // data around in variables while also doing a combination of uint32->uint64 and
                // int64->int32 casts.
                for (ShufflePair& pair : rotate.loop)
                    canSwap &= pair.width() == swapWidth;
            }

            if (hasIndex)
                canSwap &= isValidForm(swap, Arg::Tmp, Arg::Index);
        }

        if (canSwap) {
            for (unsigned i = rotate.loop.size() - 1; i--;) {
                Arg left = rotate.loop[i].src();
                Arg right = rotate.loop[i + 1].src();

                if (left.isMemory() && right.isMemory()) {
                    // Note that this is a super rare outcome. Rotates are rare. Spills are rare.
                    // Moving data between two spills is rare. To get here a lot of rare stuff has to
                    // all happen at once.
                    
                    Tmp scratch = getScratch(0, findPossibleScratch(type, left, right));
                    RELEASE_ASSERT(scratch);
                    result.append(Inst(moveForWidth(swapWidth), origin, left, scratch));
                    result.append(Inst(swap, origin, scratch, right));
                    result.append(Inst(moveForWidth(swapWidth), origin, scratch, left));
                    returnScratch(0, scratch);
                    continue;
                }

                if (left.isMemory())
                    std::swap(left, right);
                
                result.append(Inst(swap, origin, left, right));
            }

            for (ShufflePair pair : rotate.loop) {
                if (pair.width() == swapWidth)
                    continue;

                RELEASE_ASSERT(pair.width() == Arg::Width32);
                RELEASE_ASSERT(swapWidth == Arg::Width64);
                RELEASE_ASSERT(pair.dst().isTmp());

                // Need to do an extra zero extension.
                result.append(Inst(Move32, origin, pair.dst(), pair.dst()));
            }
        } else {
            // We can treat this as a shift so long as we take the last destination (i.e. first
            // source) and save it first. Then we handle the first entry in the pair in the rotate
            // specially, after we restore the last destination. This requires some special care to
            // find a scratch register. It's possible that we have a rotate that uses the entire
            // available register file.

            Tmp scratch = findPossibleScratch(
                type,
                [&] (Tmp tmp) -> bool {
                    for (ShufflePair pair : rotate.loop) {
                        if (pair.src().usesTmp(tmp))
                            return false;
                        if (pair.dst().usesTmp(tmp))
                            return false;
                    }
                    return true;
                });

            // NOTE: This is the most likely use of scratch registers.
            scratch = getScratch(0, scratch);

            // We may not have found a scratch register. When this happens, we can just use the spill
            // slot directly.
            Arg rotateSave = scratch ? Arg(scratch) : scratches[0];
            
            handleShiftPair(
                ShufflePair(rotate.loop.last().dst(), rotateSave, rotate.loop[0].width()), 1);

            for (unsigned i = rotate.loop.size(); i-- > 1;)
                handleShiftPair(rotate.loop[i], 1);

            handleShiftPair(
                ShufflePair(rotateSave, rotate.loop[0].dst(), rotate.loop[0].width()), 1);

            if (scratch)
                returnScratch(0, scratch);
        }

        commitResult();
    }

    ASSERT(result.isEmpty());

    for (unsigned i = resultPacks.size(); i--;)
        result.appendVector(resultPacks[i]);

    return result;
}

Vector<Inst> emitShuffle(
    const Vector<ShufflePair>& pairs,
    const std::array<Arg, 2>& gpScratch, const std::array<Arg, 2>& fpScratch,
    Value* origin)
{
    Vector<ShufflePair> gpPairs;
    Vector<ShufflePair> fpPairs;
    for (const ShufflePair& pair : pairs) {
        if (pair.src().isMemory() && pair.dst().isMemory() && pair.width() > Arg::pointerWidth()) {
            // 8-byte memory-to-memory moves on a 32-bit platform are best handled as float moves.
            fpPairs.append(pair);
        } else if (pair.src().isGP() && pair.dst().isGP()) {
            // This means that gpPairs gets memory-to-memory shuffles. The assumption is that we
            // can do that more efficiently using GPRs, except in the special case above.
            gpPairs.append(pair);
        } else
            fpPairs.append(pair);
    }

    Vector<Inst> result;
    result.appendVector(emitShuffle(gpPairs, gpScratch, Arg::GP, origin));
    result.appendVector(emitShuffle(fpPairs, fpScratch, Arg::FP, origin));
    return result;
}

} } } // namespace JSC::B3::Air

#endif // ENABLE(B3_JIT)