/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include <fenv.h>

//
// Define BLAS-like interfaces with typed operands.
//

#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* asum, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	ctype*  chi1; \
	ctype_r chi1_r; \
	ctype_r chi1_i; \
	ctype_r absum; \
	dim_t   i; \
\
	/* Initialize the absolute sum accumulator to zero. */ \
	PASTEMAC(chr,set0s)( absum ); \
\
	for ( i = 0; i < n; ++i ) \
	{ \
		chi1 = x + (i  )*incx; \
\
		/* Get the real and imaginary components of chi1. */ \
		PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
\
		/* Replace chi1_r and chi1_i with their absolute values. */ \
		chi1_r = bli_fabs( chi1_r ); \
		chi1_i = bli_fabs( chi1_i ); \
\
		/* Accumulate the real and imaginary components into absum. */ \
		PASTEMAC(chr,adds)( chi1_r, absum ); \
		PASTEMAC(chr,adds)( chi1_i, absum ); \
	} \
\
	/* Store the final value of absum to the output variable. */ \
	PASTEMAC(chr,copys)( absum, *asum ); \
}

INSERT_GENTFUNCR_BASIC( asumv_unb_var1 )


#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	ctype_r* zeror = PASTEMAC(chr,0); \
	doff_t   diagoffa; \
\
	/* If the dimension is zero, return early. */ \
	if ( bli_zero_dim1( m ) ) return; \
\
	/* In order to avoid the main diagonal, we must nudge the diagonal either
	   up or down by one, depending on which triangle is currently stored. */ \
	if        ( bli_is_upper( uploa ) )   diagoffa =  1; \
	else /*if ( bli_is_lower( uploa ) )*/ diagoffa = -1; \
\
	/* We will be reflecting the stored region over the diagonal into the
	   unstored region, so a transposition is necessary. Furthermore, since
	   we are creating a Hermitian matrix, we must also conjugate. */ \
	PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
	( \
	  diagoffa, \
	  BLIS_NONUNIT_DIAG, \
	  uploa, \
	  BLIS_CONJ_TRANSPOSE, \
	  m, \
	  m, \
	  a, rs_a, cs_a, \
	  a, rs_a, cs_a, \
	  cntx, \
	  rntm  \
	); \
\
	/* Set the imaginary parts of the diagonal elements to zero. */ \
	PASTEMAC2(ch,setid,BLIS_TAPI_EX_SUF) \
	( \
	  0, \
	  m, \
	  m, \
	  zeror, \
	  a, rs_a, cs_a, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNCR_BASIC( mkherm_unb_var1 )


#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	doff_t  diagoffa; \
\
	/* If the dimension is zero, return early. */ \
	if ( bli_zero_dim1( m ) ) return; \
\
	/* In order to avoid the main diagonal, we must nudge the diagonal either
	   up or down by one, depending on which triangle is currently stored. */ \
	if        ( bli_is_upper( uploa ) )   diagoffa =  1; \
	else /*if ( bli_is_lower( uploa ) )*/ diagoffa = -1; \
\
	/* We will be reflecting the stored region over the diagonal into the
	   unstored region, so a transposition is necessary. */ \
	PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \
	( \
	  diagoffa, \
	  BLIS_NONUNIT_DIAG, \
	  uploa, \
	  BLIS_TRANSPOSE, \
	  m, \
	  m, \
	  a, rs_a, cs_a, \
	  a, rs_a, cs_a, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNC_BASIC( mksymm_unb_var1 )


#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       uplo_t  uploa, \
       dim_t   m, \
       ctype*  a, inc_t rs_a, inc_t cs_a, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	ctype*  zero = PASTEMAC(ch,0); \
	doff_t  diagoffa; \
\
	/* If the dimension is zero, return early. */ \
	if ( bli_zero_dim1( m ) ) return; \
\
	/* Toggle uplo so that it refers to the unstored triangle. */ \
	bli_toggle_uplo( &uploa ); \
\
	/* In order to avoid the main diagonal, we must nudge the diagonal either
	   up or down by one, depending on which triangle is to be zeroed. */ \
	if        ( bli_is_upper( uploa ) )   diagoffa =  1; \
	else /*if ( bli_is_lower( uploa ) )*/ diagoffa = -1; \
\
	/* Set the unstored triangle to zero. */ \
	PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
	( \
	  BLIS_NO_CONJUGATE, \
	  diagoffa, \
	  BLIS_NONUNIT_DIAG, \
	  uploa, \
	  m, \
	  m, \
	  zero, \
	  a, rs_a, cs_a, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNC_BASIC( mktrim_unb_var1 )


#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	ctype*  chi1; \
	ctype_r abs_chi1; \
	ctype_r absum; \
	dim_t   i; \
\
	/* Initialize the absolute sum accumulator to zero. */ \
	PASTEMAC(chr,set0s)( absum ); \
\
	for ( i = 0; i < n; ++i ) \
	{ \
		chi1 = x + (i  )*incx; \
\
		/* Compute the absolute value (or complex magnitude) of chi1. */ \
		PASTEMAC2(ch,chr,abval2s)( *chi1, abs_chi1 ); \
\
		/* Accumulate the absolute value of chi1 into absum. */ \
		PASTEMAC(chr,adds)( abs_chi1, absum ); \
	} \
\
	/* Store final value of absum to the output variable. */ \
	PASTEMAC(chr,copys)( absum, *norm ); \
}

INSERT_GENTFUNCR_BASIC( norm1v_unb_var1 )


#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	ctype_r* zero       = PASTEMAC(chr,0); \
	ctype_r* one        = PASTEMAC(chr,1); \
	ctype_r  scale; \
	ctype_r  sumsq; \
	ctype_r  sqrt_sumsq; \
\
	/* Initialize scale and sumsq to begin the summation. */ \
	PASTEMAC(chr,copys)( *zero, scale ); \
	PASTEMAC(chr,copys)( *one,  sumsq ); \
\
	/* Compute the sum of the squares of the vector. */ \
	PASTEMAC(ch,kername) \
	( \
	  n, \
	  x, incx, \
	  &scale, \
	  &sumsq, \
	  cntx, \
	  rntm  \
	); \
\
	/* Compute: norm = scale * sqrt( sumsq ) */ \
	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
\
	/* Store the final value to the output variable. */ \
	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
}

//INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
GENTFUNCR( scomplex, float,  c, s, normfv_unb_var1, sumsqv_unb_var1 )
GENTFUNCR( dcomplex, double, z, d, normfv_unb_var1, sumsqv_unb_var1 )

#undef  GENTFUNCR
// We've disabled the dotv-based implementation because that method of
// computing the sum of the squares of x inherently does not check for
// overflow. Instead, we use the fallback method based on sumsqv, which
// takes care to not overflow unnecessarily (ie: takes care for the
// sqrt( sum of the squares of x ) to not overflow if the sum of the
// squares of x would normally overflow. See GitHub issue #332 for
// discussion.
#if 0 //defined(FE_OVERFLOW) && !defined(__APPLE__)
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	ctype_r* zero       = PASTEMAC(chr,0); \
	ctype_r* one        = PASTEMAC(chr,1); \
	ctype_r  scale; \
	ctype_r  sumsq; \
	ctype_r  sqrt_sumsq; \
\
	/* Initialize scale and sumsq to begin the summation. */ \
	PASTEMAC(chr,copys)( *zero, scale ); \
	PASTEMAC(chr,copys)( *one,  sumsq ); \
\
	/* An optimization: first try to use dotv to compute the sum of
	   the squares of the vector. If no floating-point exceptions
	   (specifically, overflow and invalid exceptions) were produced,
	   then we accept the computed value and returne early. The cost
	   of this optimization is the "sunk" cost of the initial dotv
	   when sumsqv must be used instead. However, we expect that the
	   vast majority of use cases will not produce exceptions, and
	   therefore only one pass through the data, via dotv, will be
	   required. */ \
	if ( TRUE ) \
	{ \
		int      f_exp_raised;\
		ctype    sumsqc; \
\
		feclearexcept( FE_ALL_EXCEPT );\
\
		PASTEMAC2(ch,dotv,BLIS_TAPI_EX_SUF) \
		( \
		  BLIS_NO_CONJUGATE, \
		  BLIS_NO_CONJUGATE, \
		  n,\
		  x, incx, \
		  x, incx, \
		  &sumsqc, \
		  cntx, \
		  rntm  \
		); \
\
		PASTEMAC2(ch,chr,copys)( sumsqc, sumsq ); \
\
		f_exp_raised = fetestexcept( FE_OVERFLOW | FE_INVALID );\
\
		if ( !f_exp_raised ) \
		{ \
		    PASTEMAC(chr,sqrt2s)( sumsq, *norm ); \
		    return; \
		} \
	} \
\
	/* Compute the sum of the squares of the vector. */ \
	PASTEMAC(ch,kername) \
	( \
	  n, \
	  x, incx, \
	  &scale, \
	  &sumsq, \
	  cntx, \
	  rntm  \
	); \
\
	/* Compute: norm = scale * sqrt( sumsq ) */ \
	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
\
	/* Store the final value to the output variable. */ \
	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
}
#else
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	ctype_r* zero       = PASTEMAC(chr,0); \
	ctype_r* one        = PASTEMAC(chr,1); \
	ctype_r  scale; \
	ctype_r  sumsq; \
	ctype_r  sqrt_sumsq; \
\
	/* Initialize scale and sumsq to begin the summation. */ \
	PASTEMAC(chr,copys)( *zero, scale ); \
	PASTEMAC(chr,copys)( *one,  sumsq ); \
\
	/* Compute the sum of the squares of the vector. */ \
\
	PASTEMAC(ch,kername) \
	( \
	  n, \
	  x, incx, \
	  &scale, \
	  &sumsq, \
	  cntx, \
	  rntm  \
	); \
\
	/* Compute: norm = scale * sqrt( sumsq ) */ \
	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
\
	/* Store the final value to the output variable. */ \
	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
}
#endif
GENTFUNCR( float,   float,  s, s, normfv_unb_var1, sumsqv_unb_var1 )
GENTFUNCR( double,  double, d, d, normfv_unb_var1, sumsqv_unb_var1 )


#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	ctype*  chi1; \
	ctype_r abs_chi1; \
	ctype_r abs_chi1_max; \
	dim_t   i; \
\
	/* Initialize the maximum absolute value to zero. */ \
	PASTEMAC(chr,set0s)( abs_chi1_max ); \
\
	for ( i = 0; i < n; ++i ) \
	{ \
		chi1 = x + (i  )*incx; \
\
		/* Compute the absolute value (or complex magnitude) of chi1. */ \
		PASTEMAC2(ch,chr,abval2s)( *chi1, abs_chi1 ); \
\
		/* If the absolute value of the current element exceeds that of
		   the previous largest, save it and its index. If NaN is
		   encountered, then treat it the same as if it were a valid
		   value that was larger than any previously seen. This
		   behavior mimics that of LAPACK's ?lange(). */ \
		if ( abs_chi1_max < abs_chi1 || bli_isnan( abs_chi1 ) ) \
		{ \
			PASTEMAC(chr,copys)( abs_chi1, abs_chi1_max ); \
		} \
	} \
\
	/* Store the final value to the output variable. */ \
	PASTEMAC(chr,copys)( abs_chi1_max, *norm ); \
}

INSERT_GENTFUNCR_BASIC( normiv_unb_var1 )


#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	ctype*  one = PASTEMAC(ch,1); \
	ctype*  x0; \
	ctype*  chi1; \
	ctype*  x2; \
	ctype_r absum_max; \
	ctype_r absum_j; \
	ctype_r abval_chi1; \
	uplo_t  uplox_eff; \
	dim_t   n_iter; \
	dim_t   n_elem, n_elem_max; \
	inc_t   ldx, incx; \
	dim_t   j, i; \
	dim_t   ij0, n_shift; \
\
	/* Initialize the maximum absolute column sum to zero. */ \
	PASTEMAC(chr,set0s)( absum_max ); \
\
	/* If either dimension is zero, return with absum_max equal to zero. */ \
	if ( bli_zero_dim2( m, n ) ) \
	{ \
		PASTEMAC(chr,copys)( absum_max, *norm ); \
		return; \
	} \
\
	/* Set various loop parameters. */ \
	bli_set_dims_incs_uplo_1m_noswap \
	( \
	  diagoffx, BLIS_NONUNIT_DIAG, \
	  uplox, m, n, rs_x, cs_x, \
	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \
	  &ij0, &n_shift \
	); \
\
	/* If the matrix is zeros, return with absum_max equal to zero. */ \
	if ( bli_is_zeros( uplox_eff ) ) \
	{ \
		PASTEMAC(chr,copys)( absum_max, *norm ); \
		return; \
	} \
\
\
	/* Handle dense and upper/lower storage cases separately. */ \
	if ( bli_is_dense( uplox_eff ) ) \
	{ \
		for ( j = 0; j < n_iter; ++j ) \
		{ \
			n_elem = n_elem_max; \
\
			x0     = x + (j  )*ldx + (0  )*incx; \
\
			/* Compute the norm of the current column. */ \
			PASTEMAC(ch,kername) \
			( \
			  n_elem, \
			  x0, incx, \
			  &absum_j, \
			  cntx, \
			  rntm  \
			); \
\
			/* If absum_j is greater than the previous maximum value,
			   then save it. */ \
			if ( absum_max < absum_j || bli_isnan( absum_j ) ) \
			{ \
				PASTEMAC(chr,copys)( absum_j, absum_max ); \
			} \
		} \
	} \
	else \
	{ \
		if ( bli_is_upper( uplox_eff ) ) \
		{ \
			for ( j = 0; j < n_iter; ++j ) \
			{ \
				n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
\
				x0     = x + (ij0+j  )*ldx + (0       )*incx; \
				chi1   = x + (ij0+j  )*ldx + (n_elem-1)*incx; \
\
				/* Compute the norm of the super-diagonal elements. */ \
				PASTEMAC(ch,kername) \
				( \
				  n_elem - 1, \
				  x0, incx, \
				  &absum_j, \
				  cntx, \
				  rntm  \
				); \
\
				if ( bli_is_unit_diag( diagx ) ) chi1 = one; \
\
				/* Handle the diagonal element separately in case it's
				   unit. */ \
				PASTEMAC2(ch,chr,abval2s)( *chi1, abval_chi1 ); \
				PASTEMAC(chr,adds)( abval_chi1, absum_j ); \
\
				/* If absum_j is greater than the previous maximum value,
				   then save it. */ \
				if ( absum_max < absum_j || bli_isnan( absum_j ) ) \
				{ \
					PASTEMAC(chr,copys)( absum_j, absum_max ); \
				} \
			} \
		} \
		else if ( bli_is_lower( uplox_eff ) ) \
		{ \
			for ( j = 0; j < n_iter; ++j ) \
			{ \
				i      = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
				n_elem = n_elem_max - i; \
\
				chi1   = x + (j  )*ldx + (ij0+i  )*incx; \
				x2     = x + (j  )*ldx + (ij0+i+1)*incx; \
\
				/* Compute the norm of the sub-diagonal elements. */ \
				PASTEMAC(ch,kername) \
				( \
				  n_elem - 1, \
				  x2, incx, \
				  &absum_j, \
				  cntx, \
				  rntm  \
				); \
\
				if ( bli_is_unit_diag( diagx ) ) chi1 = one; \
\
				/* Handle the diagonal element separately in case it's
				   unit. */ \
				PASTEMAC2(ch,chr,abval2s)( *chi1, abval_chi1 ); \
				PASTEMAC(chr,adds)( abval_chi1, absum_j ); \
\
				/* If absum_j is greater than the previous maximum value,
				   then save it. */ \
				if ( absum_max < absum_j || bli_isnan( absum_j ) ) \
				{ \
					PASTEMAC(chr,copys)( absum_j, absum_max ); \
				} \
			} \
		} \
	} \
\
	/* Store final value of absum_max to the output variable. */ \
	PASTEMAC(chr,copys)( absum_max, *norm ); \
}

INSERT_GENTFUNCR_BASIC( norm1m_unb_var1, norm1v_unb_var1 )


#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	ctype*   one    = PASTEMAC(ch,1); \
	ctype_r* one_r  = PASTEMAC(chr,1); \
	ctype_r* zero_r = PASTEMAC(chr,0); \
	ctype*   x0; \
	ctype*   chi1; \
	ctype*   x2; \
	ctype_r  scale; \
	ctype_r  sumsq; \
	ctype_r  sqrt_sumsq; \
	uplo_t   uplox_eff; \
	dim_t    n_iter; \
	dim_t    n_elem, n_elem_max; \
	inc_t    ldx, incx; \
	dim_t    j, i; \
	dim_t    ij0, n_shift; \
\
	/* Return a norm of zero if either dimension is zero. */ \
	if ( bli_zero_dim2( m, n ) ) \
	{ \
		PASTEMAC(chr,set0s)( *norm ); \
		return; \
	} \
\
	/* Set various loop parameters. Here, we pretend that diagx is equal to
	   BLIS_NONUNIT_DIAG because we handle the unit diagonal case manually. */ \
	bli_set_dims_incs_uplo_1m \
	( \
	  diagoffx, BLIS_NONUNIT_DIAG, \
	  uplox, m, n, rs_x, cs_x, \
	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \
	  &ij0, &n_shift \
	); \
\
	/* Check the effective uplo; if it's zeros, then our norm is zero. */ \
	if ( bli_is_zeros( uplox_eff ) ) \
	{ \
		PASTEMAC(chr,set0s)( *norm ); \
		return; \
	} \
\
	/* Initialize scale and sumsq to begin the summation. */ \
	PASTEMAC(chr,copys)( *zero_r, scale ); \
	PASTEMAC(chr,copys)( *one_r,  sumsq ); \
\
	/* Handle dense and upper/lower storage cases separately. */ \
	if ( bli_is_dense( uplox_eff ) ) \
	{ \
		for ( j = 0; j < n_iter; ++j ) \
		{ \
			n_elem = n_elem_max; \
\
			x0     = x + (j  )*ldx + (0  )*incx; \
\
			/* Compute the norm of the current column. */ \
			PASTEMAC(ch,kername) \
			( \
			  n_elem, \
			  x0, incx, \
			  &scale, \
			  &sumsq, \
			  cntx, \
			  rntm  \
			); \
		} \
	} \
	else \
	{ \
		if ( bli_is_upper( uplox_eff ) ) \
		{ \
			for ( j = 0; j < n_iter; ++j ) \
			{ \
				n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
\
				x0     = x + (ij0+j  )*ldx + (0       )*incx; \
				chi1   = x + (ij0+j  )*ldx + (n_elem-1)*incx; \
\
				/* Sum the squares of the super-diagonal elements. */ \
				PASTEMAC(ch,kername) \
				( \
				  n_elem - 1, \
				  x0, incx, \
				  &scale, \
				  &sumsq, \
				  cntx, \
				  rntm  \
				); \
\
				if ( bli_is_unit_diag( diagx ) ) chi1 = one; \
\
				/* Handle the diagonal element separately in case it's
				   unit. */ \
				PASTEMAC(ch,kername) \
				( \
				  1, \
				  chi1, incx, \
				  &scale, \
				  &sumsq, \
				  cntx, \
				  rntm  \
				); \
			} \
		} \
		else if ( bli_is_lower( uplox_eff ) ) \
		{ \
			for ( j = 0; j < n_iter; ++j ) \
			{ \
				i      = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
				n_elem = n_elem_max - i; \
\
				chi1   = x + (j  )*ldx + (ij0+i  )*incx; \
				x2     = x + (j  )*ldx + (ij0+i+1)*incx; \
\
				/* Sum the squares of the sub-diagonal elements. */ \
				PASTEMAC(ch,kername) \
				( \
				  n_elem - 1, \
				  x2, incx, \
				  &scale, \
				  &sumsq, \
				  cntx, \
				  rntm  \
				); \
\
				if ( bli_is_unit_diag( diagx ) ) chi1 = one; \
\
				/* Handle the diagonal element separately in case it's
				   unit. */ \
				PASTEMAC(ch,kername) \
				( \
				  1, \
				  chi1, incx, \
				  &scale, \
				  &sumsq, \
				  cntx, \
				  rntm  \
				); \
			} \
		} \
	} \
\
	/* Compute: norm = scale * sqrt( sumsq ) */ \
	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
\
	/* Store the final value to the output variable. */ \
	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
}

INSERT_GENTFUNCR_BASIC( normfm_unb_var1, sumsqv_unb_var1 )


#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t   diagoffx, \
       diag_t   diagx, \
       uplo_t   uplox, \
       dim_t    m, \
       dim_t    n, \
       ctype*   x, inc_t rs_x, inc_t cs_x, \
       ctype_r* norm, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	/* Induce a transposition so that rows become columns. */ \
	bli_swap_dims( &m, &n ); \
	bli_swap_incs( &rs_x, &cs_x ); \
	bli_toggle_uplo( &uplox ); \
	bli_negate_diag_offset( &diagoffx ); \
\
	/* Now we can simply compute the 1-norm of this transposed matrix,
	   which will be equivalent to the infinity-norm of the original
	   matrix. */ \
	PASTEMAC(ch,kername) \
	( \
	  diagoffx, \
	  diagx, \
	  uplox, \
	  m, \
	  n, \
	  x, rs_x, cs_x, \
	  norm, \
	  cntx, \
	  rntm  \
	); \
}

INSERT_GENTFUNCR_BASIC( normim_unb_var1, norm1m_unb_var1 )


#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, randmac ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t   n, \
       ctype*  x, inc_t incx, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	ctype* chi1; \
	dim_t  i; \
\
	chi1 = x; \
\
	for ( i = 0; i < n; ++i ) \
	{ \
		PASTEMAC(ch,randmac)( *chi1 ); \
\
		chi1 += incx; \
	} \
}

INSERT_GENTFUNC_BASIC( randv_unb_var1,  rands )
INSERT_GENTFUNC_BASIC( randnv_unb_var1, randnp2s )


#undef  GENTFUNC
#define GENTFUNC( ctype, ch, varname, kername ) \
\
void PASTEMAC(ch,varname) \
     ( \
       doff_t  diagoffx, \
       uplo_t  uplox, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       cntx_t* cntx, \
       rntm_t* rntm  \
     ) \
{ \
	ctype* one = PASTEMAC(ch,1); \
	ctype* x0; \
	ctype* x1; \
	ctype* x2; \
	ctype* chi1; \
	ctype  beta; \
	ctype  omega; \
	double max_m_n; \
	uplo_t uplox_eff; \
	dim_t  n_iter; \
	dim_t  n_elem, n_elem_max; \
	inc_t  ldx, incx; \
	dim_t  j, i; \
	dim_t  ij0, n_shift; \
\
	/* Set various loop parameters. Here, we pretend that diagx is equal to
	   BLIS_NONUNIT_DIAG because we handle the unit diagonal case manually. */ \
	bli_set_dims_incs_uplo_1m \
	( \
	  diagoffx, BLIS_NONUNIT_DIAG, \
	  uplox, m, n, rs_x, cs_x, \
	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, \
	  &ij0, &n_shift \
	); \
\
	if ( bli_is_zeros( uplox_eff ) ) return; \
\
	/* Handle dense and upper/lower storage cases separately. */ \
	if ( bli_is_dense( uplox_eff ) ) \
	{ \
		for ( j = 0; j < n_iter; ++j ) \
		{ \
			n_elem = n_elem_max; \
\
			x1     = x + (j  )*ldx + (0  )*incx; \
\
			/*PASTEMAC2(ch,kername,BLIS_TAPI_EX_SUF)*/ \
			PASTEMAC(ch,kername) \
			( \
			  n_elem, \
			  x1, incx, \
			  cntx, \
			  rntm  \
			); \
		} \
	} \
	else \
	{ \
		max_m_n = bli_max( m, n ); \
\
		PASTEMAC2(d,ch,sets)( max_m_n, 0.0, omega ); \
		PASTEMAC(ch,copys)( *one, beta ); \
		PASTEMAC(ch,invscals)( omega, beta ); \
\
		if ( bli_is_upper( uplox_eff ) ) \
		{ \
			for ( j = 0; j < n_iter; ++j ) \
			{ \
				n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
\
				x1     = x + (ij0+j  )*ldx + (0  )*incx; \
				x0     = x1; \
				chi1   = x1 + (n_elem-1)*incx; \
\
				/*PASTEMAC2(ch,kername,BLIS_TAPI_EX_SUF)*/ \
				PASTEMAC(ch,kername) \
				( \
				  n_elem, \
				  x1, incx, \
				  cntx, \
				  rntm  \
				); \
\
				( void )x0; \
				( void )chi1; \
				/* We want positive diagonal elements between 1 and 2. */ \
/*
				PASTEMAC(ch,abval2s)( *chi1, *chi1 ); \
				PASTEMAC(ch,adds)( *one, *chi1 ); \
*/ \
\
				/* Scale the super-diagonal elements by 1/max(m,n). */ \
/*
				PASTEMAC(ch,scalv) \
				( \
				  BLIS_NO_CONJUGATE, \
				  n_elem - 1, \
				  &beta, \
				  x0, incx, \
				  cntx  \
				); \
*/ \
			} \
		} \
		else if ( bli_is_lower( uplox_eff ) ) \
		{ \
			for ( j = 0; j < n_iter; ++j ) \
			{ \
				i      = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
				n_elem = n_elem_max - i; \
\
				x1     = x + (j  )*ldx + (ij0+i  )*incx; \
				x2     = x1 + incx; \
				chi1   = x1; \
\
				/*PASTEMAC2(ch,kername,BLIS_TAPI_EX_SUF)*/ \
				PASTEMAC(ch,kername) \
				( \
				  n_elem, \
				  x1, incx, \
				  cntx, \
				  rntm  \
				); \
\
				( void )x2; \
				( void )chi1; \
				/* We want positive diagonal elements between 1 and 2. */ \
/*
				PASTEMAC(ch,abval2s)( *chi1, *chi1 ); \
				PASTEMAC(ch,adds)( *one, *chi1 ); \
*/ \
\
				/* Scale the sub-diagonal elements by 1/max(m,n). */ \
/*
				PASTEMAC(ch,scalv) \
				( \
				  BLIS_NO_CONJUGATE, \
				  n_elem - 1, \
				  &beta, \
				  x2, incx, \
				  cntx  \
				); \
*/ \
			} \
		} \
	} \
}

INSERT_GENTFUNC_BASIC( randm_unb_var1,  randv_unb_var1 )
INSERT_GENTFUNC_BASIC( randnm_unb_var1, randnv_unb_var1 )


#undef  GENTFUNCR
#define GENTFUNCR( ctype, ctype_r, ch, chr, varname ) \
\
void PASTEMAC(ch,varname) \
     ( \
       dim_t    n, \
       ctype*   x, inc_t incx, \
       ctype_r* scale, \
       ctype_r* sumsq, \
       cntx_t*  cntx, \
       rntm_t*  rntm  \
     ) \
{ \
	ctype_r zero_r = *PASTEMAC(chr,0); \
	ctype_r one_r  = *PASTEMAC(chr,1); \
\
	ctype*  chi1; \
	ctype_r chi1_r; \
	ctype_r chi1_i; \
	ctype_r scale_r; \
	ctype_r sumsq_r; \
	ctype_r abs_chi1_r; \
	ctype_r abs_chi1_i; \
	dim_t   i; \
\
	/* NOTE: This function attempts to mimic the algorithm for computing
	   the Frobenius norm in netlib LAPACK's ?lassq(). */ \
\
	/* Copy scale and sumsq to local variables. */ \
	PASTEMAC(chr,copys)( *scale, scale_r ); \
	PASTEMAC(chr,copys)( *sumsq, sumsq_r ); \
\
	chi1 = x; \
\
	for ( i = 0; i < n; ++i ) \
	{ \
		/* Get the real and imaginary components of chi1. */ \
		PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
\
		abs_chi1_r = bli_fabs( chi1_r ); \
		abs_chi1_i = bli_fabs( chi1_i ); \
\
		if ( bli_isnan( abs_chi1_r ) ) \
		{ \
			sumsq_r = abs_chi1_r; \
			scale_r = one_r; \
		} \
\
		if ( bli_isnan( abs_chi1_i ) ) \
		{ \
			sumsq_r = abs_chi1_i; \
			scale_r = one_r; \
		} \
\
		if ( bli_isnan( sumsq_r ) ) \
		{ \
			chi1 += incx; \
			continue; \
		} \
\
		if ( bli_isinf( abs_chi1_r ) ) \
		{ \
			sumsq_r = abs_chi1_r; \
			scale_r = one_r; \
		} \
\
		if ( bli_isinf( abs_chi1_i ) ) \
		{ \
			sumsq_r = abs_chi1_i; \
			scale_r = one_r; \
		} \
\
		if ( bli_isinf( sumsq_r ) ) \
		{ \
			chi1 += incx; \
			continue; \
		} \
\
		/* Accumulate real component into sumsq, adjusting scale if
		   needed. */ \
		if ( abs_chi1_r > zero_r ) \
		{ \
			if ( scale_r < abs_chi1_r ) \
			{ \
				sumsq_r = one_r + \
				          sumsq_r * ( scale_r / abs_chi1_r ) * \
				                    ( scale_r / abs_chi1_r );  \
\
				PASTEMAC(chr,copys)( abs_chi1_r, scale_r ); \
			} \
			else \
			{ \
				sumsq_r = sumsq_r + ( abs_chi1_r / scale_r ) * \
				                    ( abs_chi1_r / scale_r );  \
			} \
		} \
\
		/* Accumulate imaginary component into sumsq, adjusting scale if
		   needed. */ \
		if ( abs_chi1_i > zero_r ) \
		{ \
			if ( scale_r < abs_chi1_i ) \
			{ \
				sumsq_r = one_r + \
				          sumsq_r * ( scale_r / abs_chi1_i ) * \
				                    ( scale_r / abs_chi1_i );  \
\
				PASTEMAC(chr,copys)( abs_chi1_i, scale_r ); \
			} \
			else \
			{ \
				sumsq_r = sumsq_r + ( abs_chi1_i / scale_r ) * \
				                    ( abs_chi1_i / scale_r );  \
			} \
		} \
\
		chi1 += incx; \
	} \
\
	/* Store final values of scale and sumsq to output variables. */ \
	PASTEMAC(chr,copys)( scale_r, *scale ); \
	PASTEMAC(chr,copys)( sumsq_r, *sumsq ); \
}

INSERT_GENTFUNCR_BASIC( sumsqv_unb_var1 )

// -----------------------------------------------------------------------------

#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
bool PASTEMAC(ch,opname) \
     ( \
       conj_t conjx, \
       dim_t  n, \
       ctype* x, inc_t incx, \
       ctype* y, inc_t incy  \
     ) \
{ \
	for ( dim_t i = 0; i < n; ++i ) \
	{ \
		ctype* chi1 = x + (i  )*incx; \
		ctype* psi1 = y + (i  )*incy; \
\
		ctype chi1c; \
\
		if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *chi1, chi1c ); } \
		else                        { PASTEMAC(ch,copys)( *chi1, chi1c ); } \
\
		if ( !PASTEMAC(ch,eq)( chi1c, *psi1 ) ) \
			return FALSE; \
	} \
\
	return TRUE; \
}

INSERT_GENTFUNC_BASIC( eqv_unb_var1 )


#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
bool PASTEMAC(ch,opname) \
     ( \
       doff_t  diagoffx, \
       diag_t  diagx, \
       uplo_t  uplox, \
       trans_t transx, \
       dim_t   m, \
       dim_t   n, \
       ctype*  x, inc_t rs_x, inc_t cs_x, \
       ctype*  y, inc_t rs_y, inc_t cs_y  \
     ) \
{ \
	uplo_t   uplox_eff; \
	conj_t   conjx; \
	dim_t    n_iter; \
	dim_t    n_elem_max; \
	inc_t    ldx, incx; \
	inc_t    ldy, incy; \
	dim_t    ij0, n_shift; \
\
	/* Set various loop parameters. */ \
	bli_set_dims_incs_uplo_2m \
	( \
	  diagoffx, diagx, transx, \
	  uplox, m, n, rs_x, cs_x, rs_y, cs_y, \
	  &uplox_eff, &n_elem_max, &n_iter, &incx, &ldx, &incy, &ldy, \
	  &ij0, &n_shift \
	); \
\
	/* In the odd case where we are comparing against a complete unstored
	   matrix, we assert equality. Why? We assume the matrices are equal
	   unless we can find two corresponding elements that are unequal. So
	   if there are no elements, there is no inequality. Granted, this logic
	   is strange to think about no matter what, and thankfully it should
	   never be used under normal usage. */ \
	if ( bli_is_zeros( uplox_eff ) ) return TRUE; \
\
	/* Extract the conjugation component from the transx parameter. */ \
	conjx = bli_extract_conj( transx ); \
\
	/* Handle dense and upper/lower storage cases separately. */ \
	if ( bli_is_dense( uplox_eff ) ) \
	{ \
		for ( dim_t j = 0; j < n_iter; ++j ) \
		{ \
			const dim_t n_elem = n_elem_max; \
\
			ctype* x1 = x + (j  )*ldx + (0  )*incx; \
			ctype* y1 = y + (j  )*ldy + (0  )*incy; \
\
			for ( dim_t i = 0; i < n_elem; ++i ) \
			{ \
				ctype* x11 = x1 + (i  )*incx; \
				ctype* y11 = y1 + (i  )*incy; \
				ctype  x11c; \
\
				if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \
				else                        { PASTEMAC(ch,copys)( *x11, x11c ); } \
\
				if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \
					return FALSE; \
			} \
		} \
	} \
	else \
	{ \
		if ( bli_is_upper( uplox_eff ) ) \
		{ \
			for ( dim_t j = 0; j < n_iter; ++j ) \
			{ \
				const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \
\
				ctype* x1 = x + (ij0+j  )*ldx + (0  )*incx; \
				ctype* y1 = y + (ij0+j  )*ldy + (0  )*incy; \
\
				for ( dim_t i = 0; i < n_elem; ++i ) \
				{ \
					ctype* x11 = x1 + (i  )*incx; \
					ctype* y11 = y1 + (i  )*incy; \
					ctype  x11c; \
\
					if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \
					else                        { PASTEMAC(ch,copys)( *x11, x11c ); } \
\
					if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \
						return FALSE; \
				} \
			} \
		} \
		else if ( bli_is_lower( uplox_eff ) ) \
		{ \
			for ( dim_t j = 0; j < n_iter; ++j ) \
			{ \
				const dim_t offi   = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \
				const dim_t n_elem = n_elem_max - offi; \
\
				ctype* x1 = x + (j  )*ldx + (ij0+offi  )*incx; \
				ctype* y1 = y + (j  )*ldy + (ij0+offi  )*incy; \
\
				for ( dim_t i = 0; i < n_elem; ++i ) \
				{ \
					ctype* x11 = x1 + (i  )*incx; \
					ctype* y11 = y1 + (i  )*incy; \
					ctype  x11c; \
\
					if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \
					else                        { PASTEMAC(ch,copys)( *x11, x11c ); } \
\
					if ( !PASTEMAC(ch,eq)( x11c, *y11 ) ) \
						return FALSE; \
				} \
			} \
		} \
	} \
\
	return TRUE; \
}

INSERT_GENTFUNC_BASIC( eqm_unb_var1 )


#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
             FILE*  file, \
       const char*  s1, \
             dim_t  n, \
       const ctype* x, inc_t incx, \
       const char*  format, \
       const char*  s2  \
     ) \
{ \
	const char default_spec[32] = PASTEMAC(ch,formatspec)(); \
\
	if ( format == NULL ) format = default_spec; \
\
	const ctype*chi1 = x; \
\
	fprintf( file, "%s\n", s1 ); \
\
	for ( dim_t i = 0; i < n; ++i ) \
	{ \
		PASTEMAC(ch,fprints)( file, format, *chi1 ); \
		fprintf( file, "\n" ); \
\
		chi1 += incx; \
	} \
\
	fprintf( file, "%s\n", s2 ); \
}

INSERT_GENTFUNC_BASIC_I( fprintv )


#undef  GENTFUNC
#define GENTFUNC( ctype, ch, opname ) \
\
void PASTEMAC(ch,opname) \
     ( \
             FILE*  file, \
       const char*  s1, \
             dim_t  m, \
             dim_t  n, \
       const ctype* x, inc_t rs_x, inc_t cs_x, \
       const char*  format, \
       const char*  s2  \
     ) \
{ \
	const char default_spec[32] = PASTEMAC(ch,formatspec)(); \
\
	if ( format == NULL ) format = default_spec; \
\
	fprintf( file, "%s\n", s1 ); \
\
	for ( dim_t i = 0; i < m; ++i ) \
	{ \
		for ( dim_t j = 0; j < n; ++j ) \
		{ \
			const ctype* chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \
\
			PASTEMAC(ch,fprints)( file, format, *chi1 ); \
			fprintf( file, " " ); \
		} \
\
		fprintf( file, "\n" ); \
	} \
\
	fprintf( file, "%s\n", s2 ); \
	fflush( file ); \
}

INSERT_GENTFUNC_BASIC_I( fprintm )