// MIT License
//
// Copyright (c) 2018 Xiao Wang (wangxiao@gmail.com)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// Enquiries about further applications and development opportunities are
// welcome.

#include <assert.h>
#include <stdint.h>
#include <emmintrin.h>

// Bit (x, y) of the nrows x ncols input matrix lives in byte x*ncols/8 + y/8,
// at bit position y % 8 (least significant bit first). The output uses the
// same layout with the roles of rows and columns swapped.
#define INP(x, y) inp[(x)*ncols / 8 + (y) / 8]
#define OUT(x, y) out[(y)*nrows / 8 + (x) / 8]

// Modified from
// https://mischasan.wordpress.com/2011/10/03/the-full-sse2-bit-matrix-transpose-routine/
// with innermost loops changed to _mm_set_epi8 and _mm_set_epi16
void sse_trans(uint8_t *out, uint8_t const *inp, uint64_t nrows,
               uint64_t ncols) {
    uint64_t rr, cc;
    int i, h;
    union {
        __m128i x;
        uint8_t b[16];
    } tmp;
    __m128i vec;
    assert(nrows % 8 == 0 && ncols % 8 == 0);

    // Do the main body in 16x8 blocks: gather one byte from each of 16
    // consecutive rows, then peel off two output bytes per iteration with
    // _mm_movemask_epi8 (which collects the top bit of every byte) followed
    // by a 1-bit left shift. The bound is written rr + 16 <= nrows so the
    // unsigned arithmetic cannot underflow when nrows < 16.
    for (rr = 0; rr + 16 <= nrows; rr += 16) {
        for (cc = 0; cc < ncols; cc += 8) {
            vec = _mm_set_epi8(
                INP(rr + 15, cc), INP(rr + 14, cc), INP(rr + 13, cc),
                INP(rr + 12, cc), INP(rr + 11, cc), INP(rr + 10, cc),
                INP(rr + 9, cc), INP(rr + 8, cc), INP(rr + 7, cc),
                INP(rr + 6, cc), INP(rr + 5, cc), INP(rr + 4, cc),
                INP(rr + 3, cc), INP(rr + 2, cc), INP(rr + 1, cc),
                INP(rr + 0, cc));
            for (i = 8; --i >= 0; vec = _mm_slli_epi64(vec, 1))
                *(uint16_t *)&OUT(rr, cc + i) = _mm_movemask_epi8(vec);
        }
    }
    if (rr == nrows)
        return;

    // The remainder is a block of 8x(16n+8) bits (n may be 0).
    // Do a PAIR of 8x8 blocks in each step:
    if ((ncols % 8 == 0 && ncols % 16 != 0) ||
        (nrows % 8 == 0 && nrows % 16 != 0)) {
        // The fancy optimizations in the else-branch don't work if the above
        // if-condition holds, so we use the simpler non-SIMD variant for that
        // case.
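        // Each step reads 16 bits (two adjacent bytes) from each of the 8
        // remaining rows, splits them across the low and high halves of tmp,
        // and extracts output bytes with the same movemask-and-shift trick:
        // the low 8 mask bits belong to column group cc, the high 8 bits to
        // column group cc + 8.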
        // The bound cc + 16 <= ncols avoids unsigned underflow when ncols == 8.
        for (cc = 0; cc + 16 <= ncols; cc += 16) {
            for (i = 0; i < 8; ++i) {
                tmp.b[i] = h = *(uint16_t const *)&INP(rr + i, cc);
                tmp.b[i + 8] = h >> 8;
            }
            for (i = 8; --i >= 0; tmp.x = _mm_slli_epi64(tmp.x, 1)) {
                OUT(rr, cc + i) = h = _mm_movemask_epi8(tmp.x);
                OUT(rr, cc + i + 8) = h >> 8;
            }
        }
    } else {
        // Same idea, but load each row's 16 bits straight into a vector lane
        // with _mm_set_epi16 instead of going through the tmp union.
        for (cc = 0; cc + 16 <= ncols; cc += 16) {
            vec = _mm_set_epi16(*(uint16_t const *)&INP(rr + 7, cc),
                                *(uint16_t const *)&INP(rr + 6, cc),
                                *(uint16_t const *)&INP(rr + 5, cc),
                                *(uint16_t const *)&INP(rr + 4, cc),
                                *(uint16_t const *)&INP(rr + 3, cc),
                                *(uint16_t const *)&INP(rr + 2, cc),
                                *(uint16_t const *)&INP(rr + 1, cc),
                                *(uint16_t const *)&INP(rr + 0, cc));
            for (i = 8; --i >= 0; vec = _mm_slli_epi64(vec, 1)) {
                OUT(rr, cc + i) = h = _mm_movemask_epi8(vec);
                OUT(rr, cc + i + 8) = h >> 8;
            }
        }
    }
    if (cc == ncols)
        return;

    // Do the remaining 8x8 block:
    for (i = 0; i < 8; ++i)
        tmp.b[i] = INP(rr + i, cc);
    for (i = 8; --i >= 0; tmp.x = _mm_slli_epi64(tmp.x, 1))
        OUT(rr, cc + i) = _mm_movemask_epi8(tmp.x);
}
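
// A minimal usage sketch: build a 24x24 bit matrix, transpose it, and check
// that bit (r, c) of the input equals bit (c, r) of the output. Dimensions of
// 24 exercise the 16x8 main body, the paired 8x8 remainder, and the trailing
// 8x8 block. The SSE_TRANS_DEMO guard and the get_bit helper are names chosen
// here for illustration; compile with -DSSE_TRANS_DEMO to enable the test.
#ifdef SSE_TRANS_DEMO
#include <stdio.h>
#include <string.h>

// Bit (x, y) of a bit matrix with `width` columns, using the layout that
// INP/OUT assume: byte x*width/8 + y/8, bit y % 8 (LSB first).
static int get_bit(const uint8_t *m, uint64_t x, uint64_t y, uint64_t width) {
    return (m[x * width / 8 + y / 8] >> (y % 8)) & 1;
}

int main(void) {
    enum { R = 24, C = 24 };
    uint8_t in[R * C / 8], out[C * R / 8];
    for (size_t k = 0; k < sizeof in; ++k)
        in[k] = (uint8_t)(151 * k + 3); // arbitrary deterministic pattern
    memset(out, 0, sizeof out);
    sse_trans(out, in, R, C);
    for (uint64_t r = 0; r < R; ++r)
        for (uint64_t c = 0; c < C; ++c)
            assert(get_bit(in, r, c, C) == get_bit(out, c, r, R));
    puts("sse_trans: 24x24 transpose verified");
    return 0;
}
#endif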