/* Runtime detection of optional processor characteristics.
 * 
 * Contents:
 *   1. Checking for support of x86 vector code
 *   2. Internal code used in those checks
 *   3. Unit tests
 *   4. Test driver
 *   5. Example
 *
 * References:
 *   https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
 *   https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support
 *   https://en.wikipedia.org/wiki/CPUID
 */
#include <esl_config.h>

#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif

#include "easel.h"
#include "esl_cpu.h"

/* Forward declarations of the static helpers defined in section (2).
 *
 * Each prototype sits behind the same eslENABLE_* guard as its
 * definition further down, so a helper is only declared in builds
 * that also define it.
 */
#if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
static void cpu_run_id(uint32_t eax, uint32_t ecx, uint32_t *abcd);  // raw cpuid query, used by all cpu_has_*() checks
#endif
#ifdef eslENABLE_SSE
static int  cpu_has_sse(void);
#endif
#ifdef eslENABLE_SSE4
static int  cpu_has_sse4(void);
#endif
#ifdef eslENABLE_AVX
static int  cpu_check_xcr0_ymm(void);   // reads XCR0 via xgetbv, tests xmm+ymm bits
static int  cpu_has_avx(void);
#endif
#ifdef eslENABLE_AVX512
static int  cpu_check_xcr0_zmm(void);   // reads XCR0 via xgetbv, tests xmm+ymm+zmm bits
static int  cpu_has_avx512(void);
#endif

/*****************************************************************
 * 1. Checking for support of x86 vector code
 *****************************************************************/

/* Function:  esl_cpu_has_sse()
 * Synopsis:  Report whether our SSE code can run on this processor.
 * Incept:    SRE, Wed Feb  1 09:19:11 2017
 *
 * Purpose:   Answers two questions at once: was an SSE vector
 *            implementation compiled in (<eslENABLE_SSE>), and does
 *            the processor we are running on support it (SSE and
 *            SSE2)?
 *
 * Returns:   TRUE if both hold, FALSE otherwise. A build without
 *            <eslENABLE_SSE> always answers FALSE.
 *
 * Note:      The processor is probed on the first call only; the
 *            answer is kept in a function-local static flag. No
 *            locking is used. The flag only ever moves from "unknown"
 *            (-1) to the probed value, so if two threads race on the
 *            first call, the worst case is that both store the same
 *            answer.
 */
int
esl_cpu_has_sse(void)
{
#ifdef eslENABLE_SSE
  static int cached = -1;             // -1: processor not probed yet

  if (cached >= 0) return cached;     // answered on an earlier call
  cached = cpu_has_sse();
  return cached;
#else
  return 0;
#endif
}


/* Function:  esl_cpu_has_sse4()
 * Synopsis:  Report whether our SSE4 code can run on this processor.
 * Incept:    SRE, Wed Jun  6 11:49:46 2018 [OdjBox, Otto Croy]
 *
 * Purpose:   Answers whether an SSE4 vector implementation was
 *            compiled in (<eslENABLE_SSE4>) and the processor we are
 *            running on supports everything it needs: SSE, SSE2, and
 *            SSE4.1.
 *
 * Returns:   TRUE if both hold, FALSE otherwise. A build without
 *            <eslENABLE_SSE4> always answers FALSE.
 *
 * Note:      The processor is probed on the first call only; later
 *            calls return the value kept in a function-local static
 *            flag, which starts at -1 ("unknown").
 */
int
esl_cpu_has_sse4(void)
{
#ifdef eslENABLE_SSE4
  static int cached = -1;             // -1: processor not probed yet

  if (cached >= 0) return cached;     // answered on an earlier call
  cached = cpu_has_sse4();
  return cached;
#else
  return 0;
#endif
}



/* Function:  esl_cpu_has_avx()
 * Synopsis:  Report whether our AVX code can run on this processor.
 * Incept:    SRE, Wed Feb  1 09:46:36 2017
 *
 * Purpose:   Answers whether an AVX vector implementation was
 *            compiled in (<eslENABLE_AVX>) and the processor we are
 *            running on supports it. Our "AVX" code needs both AVX
 *            and AVX2.
 *
 * Returns:   TRUE if both hold, FALSE otherwise. A build without
 *            <eslENABLE_AVX> always answers FALSE.
 *
 * Note:      The processor is probed on the first call only; later
 *            calls return the value kept in a function-local static
 *            flag, which starts at -1 ("unknown").
 */
int
esl_cpu_has_avx(void)
{
#ifdef eslENABLE_AVX
  static int cached = -1;             // -1: processor not probed yet

  if (cached >= 0) return cached;     // answered on an earlier call
  cached = cpu_has_avx();
  return cached;
#else
  return 0;
#endif
}

/* Function:  esl_cpu_has_avx512()
 * Synopsis:  Report whether our AVX-512 code can run on this processor.
 * Incept:    SRE, Wed Feb  1 09:47:24 2017
 *
 * Purpose:   Answers whether an AVX512 vector implementation was
 *            compiled in (<eslENABLE_AVX512>) and the processor we
 *            are running on supports it. The processor test is the
 *            one in cpu_has_avx512(): the AVX-512 F, DQ, and BW
 *            subsets, plus OS support for the ZMM registers.
 *
 * Returns:   TRUE if both hold, FALSE otherwise. A build without
 *            <eslENABLE_AVX512> always answers FALSE.
 *
 * Note:      The processor is probed on the first call only; later
 *            calls return the value kept in a function-local static
 *            flag, which starts at -1 ("unknown").
 */
int
esl_cpu_has_avx512(void)
{
#ifdef eslENABLE_AVX512
  static int cached = -1;             // -1: processor not probed yet

  if (cached >= 0) return cached;     // answered on an earlier call
  cached = cpu_has_avx512();
  return cached;
#else
  return 0;
#endif
}



/* Function:  esl_cpu_Get()
 * Synopsis:  Name the vector implementation our dispatchers would pick.
 * Incept:    SRE, Tue May 23 12:30:37 2017 [Handsome Family, Winnebago Skeletons]
 *
 * Purpose:   Tries the compiled-in x86 implementations fastest first
 *            (AVX512, AVX, SSE4, SSE) and names the first one the
 *            processor supports. This is the choice our dispatchers
 *            make, provided they follow our standard pattern. If no
 *            x86 implementation is usable, a build with
 *            <eslENABLE_NEON> answers "NEON"; any other build answers
 *            "none".
 *
 * Returns:   One of "AVX512", "AVX", "SSE4", "SSE", "NEON", "none".
 *            The result is a string literal; the caller must not
 *            modify or free it.
 */
char *
esl_cpu_Get(void)
{
  // x86 candidates, fastest first; the first supported one wins.
#if defined(eslENABLE_AVX512)
  if (esl_cpu_has_avx512()) { return "AVX512"; }
#endif
#if defined(eslENABLE_AVX)
  if (esl_cpu_has_avx())    { return "AVX";    }
#endif
#if defined(eslENABLE_SSE4)
  if (esl_cpu_has_sse4())   { return "SSE4";   }
#endif
#if defined(eslENABLE_SSE)
  if (esl_cpu_has_sse())    { return "SSE";    }
#endif

  // Nothing above matched. NEON is reported whenever it is compiled
  // in, with no runtime probe. VMX is not reported by this function.
#if defined(eslENABLE_NEON)
  return "NEON";
#else
  return "none";
#endif
}
/*---------- end, API for x86 vector instruction checks ---------*/



/*****************************************************************
 * 2. Internal code used in x86 vector code checks
 *****************************************************************/

#if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
/* cpu_run_id()
 *
 * Execute the x86 `cpuid` instruction once and hand back all four
 * result registers.
 *
 * Bit flags in EAX (and maybe ECX) specify the information you want
 * to query from the processor. The cpuid opcode answers by setting
 * bits in the EAX, EBX, ECX, EDX registers.
 *
 * Args:    eax   - value loaded into EAX before cpuid (the query)
 *          ecx   - value loaded into ECX before cpuid (sub-query)
 *          abcd  - RETURN: caller-provided space for four uint32_t;
 *                  abcd[0..3] receive EAX, EBX, ECX, EDX respectively
 *
 * Returns: (void)
 *
 * [What all the bits mean](https://en.wikipedia.org/wiki/CPUID)
 *
 * Adapted from run_cpuid() in:
 * https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
 */
static void 
cpu_run_id(uint32_t eax, uint32_t ecx, uint32_t *abcd)
{
#if defined(_MSC_VER)
  /* The MSVC intrinsic is prototyped as __cpuidex(int[4], int, int).
   * Cast explicitly rather than passing a uint32_t* where an int* is
   * expected; int is 32 bits wide under MSVC, so the four results land
   * in abcd[0..3] unchanged.
   */
  __cpuidex((int *) abcd, (int) eax, (int) ecx);
#else
  uint32_t ebx = 0;
  uint32_t edx = 0;
#if defined( __i386__ ) && defined ( __PIC__ )   /* in case of PIC under 32-bit EBX cannot be clobbered */
  /* Park the caller's EBX in EDI, run cpuid, then swap: EDI ends up
   * holding cpuid's EBX result (read out via "=D") and EBX is restored.
   */
  __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx) );
#else
  __asm__ ( "cpuid" : "+b" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx) );
#endif
  abcd[0] = eax;
  abcd[1] = ebx;
  abcd[2] = ecx;
  abcd[3] = edx;
#endif // ! _MSC_VER
}
#endif // eslENABLE_SSE | eslENABLE_SSE4 | eslENABLE_AVX | eslENABLE_AVX512



#ifdef eslENABLE_AVX
/* cpu_check_xcr0_ymm()
 *
 * Check that the operating system supports AVX. AVX lives in the YMM
 * registers, and the OS has to save YMM state across a context
 * switch. We find out by reading XCR0 with the x86 `xgetbv`
 * instruction (ECX=0) and testing its state bits:
 *   bits 7<<5 = zmm (AVX-512)
 *   bit  1<<2 = ymm (AVX)
 *   bit  1<<1 = xmm
 * Only the EAX half of the result is kept; EDX is declared clobbered.
 *
 * Returns 1 if both the xmm and ymm bits are set, else 0.
 *
 * Some Mac OS/X assemblers do not recognize the xgetbv mnemonic, but
 * will still emit its raw byte codes. So instead of
 *   __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
 * we write
 *   __asm__(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx" );
 */
static int 
cpu_check_xcr0_ymm(void) 
{
  const uint32_t need = (1 << 1) | (1 << 2);   // xmm | ymm state bits
  uint32_t       lo32;                         // EAX half of XCR0

#if defined(_MSC_VER)
  lo32 = (uint32_t)_xgetbv(0);  /* min VS2010 SP1 compiler is required */
#else
  __asm__(".byte 0x0f, 0x01, 0xd0" : "=a" (lo32) : "c" (0) : "%edx" );
#endif

  if ((lo32 & need) != need) return 0;
  return 1;
}
#endif


#ifdef eslENABLE_AVX512
/* cpu_check_xcr0_zmm()
 * 
 * Check that the operating system supports AVX-512, which uses the
 * ZMM registers as well as YMM. Reads XCR0 with `xgetbv` (ECX=0,
 * emitted as raw bytes) and tests the EAX half of the result.
 *
 * Returns 1 if the xmm bit (1<<1), the ymm bit (1<<2), and all three
 * zmm bits (7<<5) are set, else 0.
 */
static int 
cpu_check_xcr0_zmm(void) 
{
  const uint32_t need = (1 << 1) | (1 << 2) | (7 << 5);   // xmm | ymm | zmm state bits
  uint32_t       lo32;                                    // EAX half of XCR0

#if defined(_MSC_VER)
  lo32 = (uint32_t)_xgetbv(0);  /* min VS2010 SP1 compiler is required */
#else
  __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (lo32) : "c" (0) : "%edx" );
#endif

  if ((lo32 & need) != need) return 0;
  return 1;
}
#endif


#ifdef eslENABLE_SSE
/* cpu_has_sse()
 * 
 * Ask the processor whether it supports SSE and SSE2. Easel's "SSE"
 * vector code needs both.
 *
 * Queries cpuid with EAX=1, ECX=0 and tests bits 25 (SSE) and 26
 * (SSE2) of the returned EDX.
 *
 * Returns 1 if both bits are set, else 0.
 */
static int
cpu_has_sse(void)
{
  const uint32_t edx_need = (1 << 25) |   // SSE
                            (1 << 26);    // SSE2
  uint32_t       regs[4];                 // EAX, EBX, ECX, EDX

  cpu_run_id( 1, 0, regs );
  return ( (regs[3] & edx_need) == edx_need );
}
#endif // eslENABLE_SSE


#ifdef eslENABLE_SSE4
/* cpu_has_sse4()
 * 
 * Ask the processor whether it supports SSE, SSE2, and SSE4.1.
 * Easel's "SSE4" vector code needs all three.
 *
 * Queries cpuid with EAX=1, ECX=0, then tests bits 25 (SSE) and 26
 * (SSE2) of the returned EDX and bit 19 (SSE4.1) of the returned ECX.
 *
 * Returns 1 if every tested bit is set, else 0.
 */
static int
cpu_has_sse4(void)
{
  const uint32_t edx_need = (1 << 25) |   // SSE
                            (1 << 26);    // SSE2
  const uint32_t ecx_need = (1 << 19);    // SSE4.1
  uint32_t       regs[4];                 // EAX, EBX, ECX, EDX

  cpu_run_id( 1, 0, regs );
  return ( (regs[3] & edx_need) == edx_need &&
           (regs[2] & ecx_need) == ecx_need );
}
#endif // eslENABLE_SSE4



#ifdef eslENABLE_AVX
/* cpu_has_avx()
 * 
 * Ask the processor (and OS) whether our "AVX" vector code can run.
 * Easel "AVX" code requires AVX+AVX2. The probe runs four steps in
 * order and stops at the first one that fails:
 *
 *   1. CPUID.(EAX=01H, ECX=0H):ECX  FMA[bit 12], MOVBE[bit 22],
 *                                   OSXSAVE[bit 27] all set
 *   2. XCR0 (via cpu_check_xcr0_ymm()) has xmm and ymm state enabled
 *   3. CPUID.(EAX=07H, ECX=0H):EBX  BMI1[bit 3], AVX2[bit 5],
 *                                   BMI2[bit 8] all set
 *   4. CPUID.(EAX=80000001H):ECX    LZCNT[bit 5] set
 *
 * Returns 1 if all four pass, else 0.
 */
static int 
cpu_has_avx(void)
{
  const uint32_t leaf1_ecx_need = (1 << 12) | (1 << 22) | (1 << 27);  // FMA | MOVBE | OSXSAVE
  const uint32_t leaf7_ebx_need = (1 << 3)  | (1 << 5)  | (1 << 8);   // BMI1 | AVX2 | BMI2
  const uint32_t ext1_ecx_lzcnt = (1 << 5);                           // LZCNT
  uint32_t       regs[4];                                             // EAX, EBX, ECX, EDX

  // Steps 1 and 2. The && keeps the XCR0 read behind the OSXSAVE
  // test: xgetbv is only executed once OSXSAVE has been reported.
  cpu_run_id( 1, 0, regs );
  if ( (regs[2] & leaf1_ecx_need) == leaf1_ecx_need && cpu_check_xcr0_ymm() )
    {
      // Step 3.
      cpu_run_id( 7, 0, regs );
      if ( (regs[1] & leaf7_ebx_need) == leaf7_ebx_need )
        {
          // Step 4.
          cpu_run_id( 0x80000001, 0, regs );
          return ( (regs[2] & ext1_ecx_lzcnt) != 0 );
        }
    }
  return 0;
}
#endif // eslENABLE_AVX


#ifdef eslENABLE_AVX512
/* cpu_has_avx512()
 * 
 * Ask the processor (and OS) whether our AVX-512 vector code can
 * run. Our AVX-512 code currently can depend on the Foundation,
 * Double/Quadword, and Byte/Word subsets (F, DQ, BW), and requires
 * Intel Skylake Xeon (Purley) processors or later.
 *
 * The probe runs three steps in order and stops at the first one
 * that fails:
 *
 *   1. CPUID.(EAX=01H, ECX=0H):ECX  OSXSAVE[bit 27] set
 *   2. XCR0 (via cpu_check_xcr0_zmm()) has xmm, ymm, zmm state enabled
 *   3. CPUID.(EAX=07H, ECX=0H):EBX  AVX-512F[bit 16], AVX-512DQ[bit 17],
 *                                   AVX-512BW[bit 30] all set
 *
 * Returns 1 if all three pass, else 0.
 */
static int 
cpu_has_avx512(void) 
{
  const uint32_t leaf1_ecx_osxsave = (1 << 27);
  const uint32_t leaf7_ebx_need    = (1 << 16) |   // AVX-512F
                                     (1 << 17) |   // AVX-512DQ
                                     (1 << 30);    // AVX-512BW
  uint32_t       regs[4];                          // EAX, EBX, ECX, EDX

  // Step 1. Must pass before step 2 reads XCR0 with xgetbv.
  cpu_run_id( 1, 0, regs );
  if ( (regs[2] & leaf1_ecx_osxsave) == 0 ) return 0;

  // Step 2.
  if ( ! cpu_check_xcr0_zmm() )             return 0;

  // Step 3.
  cpu_run_id( 7, 0, regs );
  return ( (regs[1] & leaf7_ebx_need) == leaf7_ebx_need );
}
#endif // eslENABLE_AVX512


/*------------ end, x86 processor interrogation -----------------*/




/*****************************************************************
 * 3. Unit tests
 *****************************************************************/

#ifdef eslCPU_TESTDRIVE


/* utest_consistency()
 * 
 * Check that the support answers nest: if we support AVX-512 we must
 * support AVX, if we support AVX we must support SSE4, and if we
 * support SSE4 we must support SSE. Any violation is reported through
 * esl_fatal().
 *
 * This isn't a strong test of anything, but we know nothing about the
 * processor that unit testing happens to run on, so it is hard to
 * guarantee a stronger one.
 * 
 * Each check is compiled only when both implementations it compares
 * are enabled, because Easel applications may define any subset of
 * vector implementations they want; for example, H4 implements SSE4
 * but not SSE.
 */
static void
utest_consistency(void)
{
  // Deliberately no locals here (such as a shared char msg[]): every
  // check below may be compiled out, and the compiler could then bark
  // about an unused variable.
#if defined(eslENABLE_AVX512) && defined(eslENABLE_AVX)
  if (esl_cpu_has_avx512())
    {
      if (! esl_cpu_has_avx())  esl_fatal("utest_consistency() failed");
    }
#endif

#if defined(eslENABLE_AVX) && defined(eslENABLE_SSE4)
  if (esl_cpu_has_avx())
    {
      if (! esl_cpu_has_sse4()) esl_fatal("utest_consistency() failed");
    }
#endif

#if defined(eslENABLE_SSE4) && defined(eslENABLE_SSE)
  if (esl_cpu_has_sse4())
    {
      if (! esl_cpu_has_sse())  esl_fatal("utest_consistency() failed");
    }
#endif
}

#endif // eslCPU_TESTDRIVE


/*****************************************************************
 * 4. Test driver
 *****************************************************************/
#ifdef eslCPU_TESTDRIVE

/* Test driver entry point.
 *
 * Prints a "## <program name>" banner to stderr, runs the unit tests,
 * then prints the "status = ok" line to stderr and returns eslOK.
 * The command line arguments beyond argv[0] are not examined.
 */
int
main(int argc, char **argv)
{
  fprintf(stderr, "## %s\n", argv[0]);   // banner: which driver this is

  utest_consistency();                   // reports failure via esl_fatal()

  fputs("#  status = ok\n", stderr);
  return eslOK;
}
#endif // eslCPU_TESTDRIVE


/*****************************************************************
 * 5. Example
 *****************************************************************/
#ifdef eslCPU_EXAMPLE

#include <esl_config.h>

#include "easel.h"
#include "esl_cpu.h"

/* Example entry point.
 *
 * Prints to stdout, one line each, whether our SSE, SSE4, AVX, and
 * AVX512 code is supported here ("yes"/"no"), followed by the
 * implementation name that esl_cpu_Get() reports. The command line
 * arguments are not examined.
 */
int 
main(int argc, char **argv)
{
  const char *answer;

  answer = esl_cpu_has_sse() ? "yes" : "no";
  printf("your cpu supports our SSE code    : %s\n",  answer);

  answer = esl_cpu_has_sse4() ? "yes" : "no";
  printf("               ...our SSE4 code   : %s\n",  answer);

  answer = esl_cpu_has_avx() ? "yes" : "no";
  printf("               ...our AVX code    : %s\n",  answer);

  answer = esl_cpu_has_avx512() ? "yes" : "no";
  printf("               ...our AVX512 code : %s\n",  answer);

  printf("Our dispatchers will choose       : %s\n",  esl_cpu_Get());
  return 0;
}
#endif // eslCPU_EXAMPLE