//-- transpose.h - Byte/Nibble transpose for further compressing with lz77 or other compressors ------------------------------------- #ifdef __cplusplus extern "C" { #endif // Syntax // in : Input buffer // n : Total number of bytes in input buffer // out : output buffer // esize : element size in bytes (ex. 2, 4, 8,... ) //---------- High level functions with dynamic cpu detection and JIT scalar/sse/avx2 switching void tpenc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // tranpose void tpdec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // reverse transpose void tpzenc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // zigzag integrated void tpzdec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); void tpxenc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // xor integrated void tpxdec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); //2D transpose void tp2denc( unsigned char *in, unsigned nx, unsigned ny, unsigned char *out, unsigned esize); void tp2ddec( unsigned char *in, unsigned nx, unsigned ny, unsigned char *out, unsigned esize); void tp2denc2( unsigned char *in, unsigned nx, unsigned ny, unsigned char *out); void tp2ddec2( unsigned char *in, unsigned nx, unsigned ny, unsigned char *out); void tp2denc4( unsigned char *in, unsigned nx, unsigned ny, unsigned char *out); void tp2ddec4( unsigned char *in, unsigned nx, unsigned ny, unsigned char *out); void tp2denc8( unsigned char *in, unsigned nx, unsigned ny, unsigned char *out); void tp2ddec8( unsigned char *in, unsigned nx, unsigned ny, unsigned char *out); //3D transpose void tp3denc( unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize); void tp3ddec( unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize); void tp3denc2( unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp3ddec2( unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp3denc4( unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp3ddec4( unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp3denc8( unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp3ddec8( unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); //4D transpose void tp4denc( unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize); void tp4ddec( unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize); void tp4denc2( unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp4ddec2( unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp4denc4( unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp4ddec4( unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp4denc8( unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); void tp4ddec8( unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out); // Nibble transpose SIMD (SSE2,AVX2, ARM Neon) void tp4enc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); void tp4dec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); void tp4zenc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // zigzag delta integrated void tp4zdec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); void tp4xenc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // xor integrated void tp4xdec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); //---------- Low level functions -------------------------------------------------------------------------------------------- void tpenc2( unsigned char *in, unsigned n, unsigned char *out); // scalar void tpdec2( unsigned char *in, unsigned n, unsigned char *out); void tpenc3( unsigned char *in, unsigned n, unsigned char *out); void tpdec3( unsigned char *in, unsigned n, unsigned char *out); void tpenc4( unsigned char *in, unsigned n, unsigned char *out); void tpdec4( unsigned char *in, unsigned n, unsigned char *out); void tpenc8( unsigned char *in, unsigned n, unsigned char *out); void tpdec8( unsigned char *in, unsigned n, unsigned char *out); void tpenc16( unsigned char *in, unsigned n, unsigned char *out); void tpdec16( unsigned char *in, unsigned n, unsigned char *out); void tpzenc2( unsigned char *in, unsigned n, unsigned char *out); // scalar zigzag void tpzdec2( unsigned char *in, unsigned n, unsigned char *out); void tpzenc3( unsigned char *in, unsigned n, unsigned char *out); void tpzdec3( unsigned char *in, unsigned n, unsigned char *out); void tpzenc4( unsigned char *in, unsigned n, unsigned char *out); void tpzdec4( unsigned char *in, unsigned n, unsigned char *out); void tpzenc8( unsigned char *in, unsigned n, unsigned char *out); void tpzdec8( unsigned char *in, unsigned n, unsigned char *out); void tpzenc16( unsigned char *in, unsigned n, unsigned char *out); void tpzdec16( unsigned char *in, unsigned n, unsigned char *out); void tpxenc2( unsigned char *in, unsigned n, unsigned char *out); // scalar xor void tpxdec2( unsigned char *in, unsigned n, unsigned char *out); void tpxenc3( unsigned char *in, unsigned n, unsigned char *out); void tpxdec3( unsigned char *in, unsigned n, unsigned char *out); void tpxenc4( unsigned char *in, unsigned n, unsigned char *out); void tpxdec4( unsigned char *in, unsigned n, unsigned char *out); void tpxenc8( unsigned char *in, unsigned n, unsigned char *out); void tpxdec8( unsigned char *in, unsigned n, unsigned char *out); void tpxenc16( unsigned char *in, unsigned n, unsigned char *out); void tpxdec16( unsigned char *in, unsigned n, unsigned char *out); //-- byte transpose - sse --------- void tpenc128v2( unsigned char *in, unsigned n, unsigned char *out); // 16 bits void tpdec128v2( unsigned char *in, unsigned n, unsigned char *out); void tpzenc128v2( unsigned char *in, unsigned n, unsigned char *out); void tpzdec128v2( unsigned char *in, unsigned n, unsigned char *out); void tpxenc128v2( unsigned char *in, unsigned n, unsigned char *out); void tpxdec128v2( unsigned char *in, unsigned n, unsigned char *out); //-- void tpenc128v4( unsigned char *in, unsigned n, unsigned char *out); // 32 bits void tpdec128v4( unsigned char *in, unsigned n, unsigned char *out); void tpzenc128v4( unsigned char *in, unsigned n, unsigned char *out); void tpzdec128v4( unsigned char *in, unsigned n, unsigned char *out); void tpxenc128v4( unsigned char *in, unsigned n, unsigned char *out); void tpxdec128v4( unsigned char *in, unsigned n, unsigned char *out); void tpenc128v8( unsigned char *in, unsigned n, unsigned char *out); // 64 bits void tpdec128v8( unsigned char *in, unsigned n, unsigned char *out); void tpzenc128v8( unsigned char *in, unsigned n, unsigned char *out); void tpzdec128v8( unsigned char *in, unsigned n, unsigned char *out); void tpxenc128v8( unsigned char *in, unsigned n, unsigned char *out); void tpxdec128v8( unsigned char *in, unsigned n, unsigned char *out); //-- nibble transpose void tp4enc128v2( unsigned char *in, unsigned n, unsigned char *out); // 16 bits void tp4dec128v2( unsigned char *in, unsigned n, unsigned char *out); void tp4zenc128v2( unsigned char *in, unsigned n, unsigned char *out); void tp4zdec128v2( unsigned char *in, unsigned n, unsigned char *out); void tp4xenc128v2( unsigned char *in, unsigned n, unsigned char *out); void tp4xdec128v2( unsigned char *in, unsigned n, unsigned char *out); void tp4enc128v4( unsigned char *in, unsigned n, unsigned char *out); // 32 bits void tp4dec128v4( unsigned char *in, unsigned n, unsigned char *out); void tp4zenc128v4( unsigned char *in, unsigned n, unsigned char *out); void tp4zdec128v4( unsigned char *in, unsigned n, unsigned char *out); void tp4xenc128v4( unsigned char *in, unsigned n, unsigned char *out); void tp4xdec128v4( unsigned char *in, unsigned n, unsigned char *out); void tp4enc128v8( unsigned char *in, unsigned n, unsigned char *out); // 64 bits void tp4dec128v8( unsigned char *in, unsigned n, unsigned char *out); void tp4zenc128v8( unsigned char *in, unsigned n, unsigned char *out); void tp4zdec128v8( unsigned char *in, unsigned n, unsigned char *out); void tp4xenc128v8( unsigned char *in, unsigned n, unsigned char *out); void tp4xdec128v8( unsigned char *in, unsigned n, unsigned char *out); //-- avx2 //-- byte transpose void tpenc256v2( unsigned char *in, unsigned n, unsigned char *out); //-- 16 bits void tpdec256v2( unsigned char *in, unsigned n, unsigned char *out); void tpenc256v4( unsigned char *in, unsigned n, unsigned char *out); //-- 32 bits void tpdec256v4( unsigned char *in, unsigned n, unsigned char *out); void tpzenc256v4( unsigned char *in, unsigned n, unsigned char *out); // zigzag void tpzdec256v4( unsigned char *in, unsigned n, unsigned char *out); void tpxenc256v4( unsigned char *in, unsigned n, unsigned char *out); // xor void tpxdec256v4( unsigned char *in, unsigned n, unsigned char *out); void tpenc256v8( unsigned char *in, unsigned n, unsigned char *out); //-- 64 bits void tpdec256v8( unsigned char *in, unsigned n, unsigned char *out); void tpzenc256v8( unsigned char *in, unsigned n, unsigned char *out); // zigzag void tpzdec256v8( unsigned char *in, unsigned n, unsigned char *out); void tpxenc256v8( unsigned char *in, unsigned n, unsigned char *out); // xor void tpxdec256v8( unsigned char *in, unsigned n, unsigned char *out); //-- Nibble transpose void tp4enc256v2( unsigned char *in, unsigned n, unsigned char *out); //-- 16 bits void tp4dec256v2( unsigned char *in, unsigned n, unsigned char *out); void tp4enc256v4( unsigned char *in, unsigned n, unsigned char *out); //-- 32 bits void tp4dec256v4( unsigned char *in, unsigned n, unsigned char *out); void tp4zenc256v4( unsigned char *in, unsigned n, unsigned char *out); // zigzag void tp4zdec256v4( unsigned char *in, unsigned n, unsigned char *out); void tp4xenc256v4( unsigned char *in, unsigned n, unsigned char *out); // xor void tp4xdec256v4( unsigned char *in, unsigned n, unsigned char *out); void tp4enc256v8( unsigned char *in, unsigned n, unsigned char *out); // 64 bits void tp4dec256v8( unsigned char *in, unsigned n, unsigned char *out); void tp4zenc256v8( unsigned char *in, unsigned n, unsigned char *out); // zigzag void tp4zdec256v8( unsigned char *in, unsigned n, unsigned char *out); void tp4xenc256v8( unsigned char *in, unsigned n, unsigned char *out); // xor void tp4xdec256v8( unsigned char *in, unsigned n, unsigned char *out); //------- CPU instruction set // cpuiset = 0: return current simd set, // cpuiset != 0: set simd set 0:scalar, 20:sse2, 52:avx2 unsigned cpuini(unsigned cpuiset); // convert simd set to string "sse3", "sse3", "sse4.1" or "avx2" // Ex.: printf("current cpu set=%s\n", cpustr(cpuini(0)) ); char *cpustr(unsigned cpuisa); unsigned cpuisa(void); #ifdef __cplusplus } #endif